diff options
Diffstat (limited to 'src/de/lmu/ifi/dbs/elki/algorithm')
96 files changed, 7348 insertions, 2209 deletions
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/APRIORI.java b/src/de/lmu/ifi/dbs/elki/algorithm/APRIORI.java index 07aaf3fc..a2f32989 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/APRIORI.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/APRIORI.java @@ -44,8 +44,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import de.lmu.ifi.dbs.elki.utilities.documentation.Title; import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.LessEqualConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.OneMustBeSetGlobalConstraint; import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.OnlyOneIsAllowedToBeSetGlobalConstraint; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; @@ -296,7 +295,7 @@ public class APRIORI extends AbstractAlgorithm<AprioriResult> { public TypeInformation[] getInputTypeRestriction() { return TypeUtil.array(TypeUtil.BIT_VECTOR_FIELD); } - + @Override protected Logging getLogger() { return LOG; @@ -325,15 +324,15 @@ public class APRIORI extends AbstractAlgorithm<AprioriResult> { super.makeOptions(config); DoubleParameter minfreqP = new DoubleParameter(MINFREQ_ID); minfreqP.setOptional(true); - minfreqP.addConstraint(new GreaterEqualConstraint(0)); - minfreqP.addConstraint(new LessEqualConstraint(1)); + minfreqP.addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_DOUBLE); + minfreqP.addConstraint(CommonConstraints.LESS_EQUAL_ONE_DOUBLE); if(config.grab(minfreqP)) { minfreq = minfreqP.getValue(); } IntParameter minsuppP = new IntParameter(MINSUPP_ID); minsuppP.setOptional(true); - minsuppP.addConstraint(new GreaterEqualConstraint(0)); + 
minsuppP.addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_INT); if(config.grab(minsuppP)) { minsupp = minsuppP.getValue(); } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/AbstractAlgorithm.java b/src/de/lmu/ifi/dbs/elki/algorithm/AbstractAlgorithm.java index 68ac9595..65b86633 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/AbstractAlgorithm.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/AbstractAlgorithm.java @@ -85,72 +85,48 @@ public abstract class AbstractAlgorithm<R extends Result> implements Algorithm { } // Find appropriate run method. - Method runmethod1 = null; - Method runmethod2 = null; try { - runmethod1 = this.getClass().getMethod("run", signature1); - runmethod2 = null; - } - catch(SecurityException e) { - throw new APIViolationException("Security exception finding an appropriate 'run' method.", e); + Method runmethod1 = this.getClass().getMethod("run", signature1); + return (R) runmethod1.invoke(this, relations1); } catch(NoSuchMethodException e) { - runmethod1 = null; - // Try without "database" parameter. - try { - runmethod2 = this.getClass().getMethod("run", signature2); - } - catch(NoSuchMethodException e2) { - runmethod2 = null; + // continue below. 
+ } + catch(IllegalArgumentException | IllegalAccessException | SecurityException e) { + throw new APIViolationException("Invoking the real 'run' method failed.", e); + } + catch(InvocationTargetException e) { + final Throwable cause = e.getTargetException(); + if(cause instanceof RuntimeException) { + throw (RuntimeException) cause; } - catch(SecurityException e2) { - throw new APIViolationException("Security exception finding an appropriate 'run' method.", e2); + if(cause instanceof Error) { + throw (Error) cause; } + throw new APIViolationException("Invoking the real 'run' method failed: " + cause.toString(), cause); } - if(runmethod1 != null) { - try { - return (R) runmethod1.invoke(this, relations1); - } - catch(IllegalArgumentException e) { - throw new APIViolationException("Invoking the real 'run' method failed.", e); - } - catch(IllegalAccessException e) { - throw new APIViolationException("Invoking the real 'run' method failed.", e); - } - catch(InvocationTargetException e) { - if(e.getTargetException() instanceof RuntimeException) { - throw (RuntimeException) e.getTargetException(); - } - if(e.getTargetException() instanceof AssertionError) { - throw (AssertionError) e.getTargetException(); - } - throw new APIViolationException("Invoking the real 'run' method failed: " + e.getTargetException().toString(), e.getTargetException()); - } + try { + Method runmethod2 = this.getClass().getMethod("run", signature2); + return (R) runmethod2.invoke(this, relations2); } - else if(runmethod2 != null) { - try { - return (R) runmethod2.invoke(this, relations2); - } - catch(IllegalArgumentException e) { - throw new APIViolationException("Invoking the real 'run' method failed.", e); - } - catch(IllegalAccessException e) { - throw new APIViolationException("Invoking the real 'run' method failed.", e); + catch(NoSuchMethodException e) { + // continue below. 
+ } + catch(IllegalArgumentException | IllegalAccessException | SecurityException e) { + throw new APIViolationException("Invoking the real 'run' method failed.", e); + } + catch(InvocationTargetException e) { + final Throwable cause = e.getTargetException(); + if(cause instanceof RuntimeException) { + throw (RuntimeException) cause; } - catch(InvocationTargetException e) { - if(e.getTargetException() instanceof RuntimeException) { - throw (RuntimeException) e.getTargetException(); - } - if(e.getTargetException() instanceof AssertionError) { - throw (AssertionError) e.getTargetException(); - } - throw new APIViolationException("Invoking the real 'run' method failed: " + e.getTargetException().toString(), e.getTargetException()); + if(cause instanceof Error) { + throw (Error) cause; } + throw new APIViolationException("Invoking the real 'run' method failed: " + cause.toString(), cause); } - else { - throw new APIViolationException("No appropriate 'run' method found."); - } + throw new APIViolationException("No appropriate 'run' method found."); } /** @@ -177,6 +153,6 @@ public abstract class AbstractAlgorithm<R extends Result> implements Algorithm { * @return Parameter object */ public static <F extends DistanceFunction<?, ?>> ObjectParameter<F> makeParameterDistanceFunction(Class<?> defaultDistanceFunction, Class<?> restriction) { - return new ObjectParameter<>(AbstractDistanceBasedAlgorithm.DISTANCE_FUNCTION_ID, restriction, defaultDistanceFunction); + return new ObjectParameter<>(DistanceBasedAlgorithm.DISTANCE_FUNCTION_ID, restriction, defaultDistanceFunction); } }
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/AbstractPrimitiveDistanceBasedAlgorithm.java b/src/de/lmu/ifi/dbs/elki/algorithm/AbstractPrimitiveDistanceBasedAlgorithm.java index 40fe67c3..5d4b24c1 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/AbstractPrimitiveDistanceBasedAlgorithm.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/AbstractPrimitiveDistanceBasedAlgorithm.java @@ -48,7 +48,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; public abstract class AbstractPrimitiveDistanceBasedAlgorithm<O, D extends Distance<?>, R extends Result> extends AbstractAlgorithm<R> { /** * Holds the instance of the distance function specified by - * {@link AbstractDistanceBasedAlgorithm#DISTANCE_FUNCTION_ID}. + * {@link DistanceBasedAlgorithm#DISTANCE_FUNCTION_ID}. */ protected PrimitiveDistanceFunction<? super O, D> distanceFunction; diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/DependencyDerivator.java b/src/de/lmu/ifi/dbs/elki/algorithm/DependencyDerivator.java index cc40d13b..dca3649e 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/DependencyDerivator.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/DependencyDerivator.java @@ -51,8 +51,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Description; import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import de.lmu.ifi.dbs.elki.utilities.documentation.Title; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.Flag; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; @@ -68,7 +67,8 @@ import 
de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; * E. Achtert, C. Böhm, H.-P. Kriegel, P. Kröger, A. Zimek: Deriving * Quantitative Dependencies for Correlation Clusters. <br> * In Proc. 12th Int. Conf. on Knowledge Discovery and Data Mining (KDD '06), - * Philadelphia, PA 2006. </p> + * Philadelphia, PA 2006. + * </p> * * @author Arthur Zimek * @param <V> the type of FeatureVector handled by this Algorithm @@ -303,20 +303,20 @@ public class DependencyDerivator<V extends NumberVector<?>, D extends Distance<D @Override protected void makeOptions(Parameterization config) { super.makeOptions(config); - + IntParameter outputAccuracyP = new IntParameter(OUTPUT_ACCURACY_ID, 4); - outputAccuracyP.addConstraint(new GreaterEqualConstraint(0)); + outputAccuracyP.addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_INT); if(config.grab(outputAccuracyP)) { outputAccuracy = outputAccuracyP.getValue(); } - + IntParameter sampleSizeP = new IntParameter(SAMPLE_SIZE_ID); sampleSizeP.setOptional(true); - sampleSizeP.addConstraint(new GreaterConstraint(0)); + sampleSizeP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT); if(config.grab(sampleSizeP)) { sampleSize = sampleSizeP.getValue(); } - + Flag randomSampleF = new Flag(DEPENDENCY_DERIVATOR_RANDOM_SAMPLE); if(config.grab(randomSampleF)) { randomSample = randomSampleF.getValue(); diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/KNNDistanceOrder.java b/src/de/lmu/ifi/dbs/elki/algorithm/KNNDistanceOrder.java index b696ed36..46cf2246 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/KNNDistanceOrder.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/KNNDistanceOrder.java @@ -43,8 +43,7 @@ import de.lmu.ifi.dbs.elki.result.KNNDistanceOrderResult; import de.lmu.ifi.dbs.elki.utilities.documentation.Description; import de.lmu.ifi.dbs.elki.utilities.documentation.Title; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint; -import 
de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.LessEqualConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; @@ -164,14 +163,14 @@ public class KNNDistanceOrder<O, D extends Distance<D>> extends AbstractDistance protected void makeOptions(Parameterization config) { super.makeOptions(config); IntParameter kP = new IntParameter(K_ID, 1); - kP.addConstraint(new GreaterConstraint(0)); + kP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT); if(config.grab(kP)) { k = kP.getValue(); } DoubleParameter percentageP = new DoubleParameter(PERCENTAGE_ID, 1.0); - percentageP.addConstraint(new GreaterConstraint(0)); - percentageP.addConstraint(new LessEqualConstraint(1)); + percentageP.addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE); + percentageP.addConstraint(CommonConstraints.LESS_EQUAL_ONE_DOUBLE); if(config.grab(percentageP)) { percentage = percentageP.getValue(); } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/KNNJoin.java b/src/de/lmu/ifi/dbs/elki/algorithm/KNNJoin.java index dddd8fdb..0f5078fb 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/KNNJoin.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/KNNJoin.java @@ -61,7 +61,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Description; import de.lmu.ifi.dbs.elki.utilities.documentation.Title; import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import 
de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; @@ -121,11 +121,11 @@ public class KNNJoin<V extends NumberVector<?>, D extends Distance<D>, N extends */ @SuppressWarnings("unchecked") public WritableDataStore<KNNList<D>> run(Database database, Relation<V> relation) { - if (!(getDistanceFunction() instanceof SpatialPrimitiveDistanceFunction)) { + if(!(getDistanceFunction() instanceof SpatialPrimitiveDistanceFunction)) { throw new IllegalStateException("Distance Function must be an instance of " + SpatialPrimitiveDistanceFunction.class.getName()); } Collection<SpatialIndexTree<N, E>> indexes = ResultUtil.filterResults(database, SpatialIndexTree.class); - if (indexes.size() != 1) { + if(indexes.size() != 1) { throw new AbortException("KNNJoin found " + indexes.size() + " spatial indexes, expected exactly one."); } // FIXME: Ensure were looking at the right relation! @@ -140,7 +140,7 @@ public class KNNJoin<V extends NumberVector<?>, D extends Distance<D>, N extends ComparableMinHeap<Task> pq = new ComparableMinHeap<>(ps_candidates.size() * ps_candidates.size() / 10); // Initialize with the page self-pairing - for (int i = 0; i < ps_candidates.size(); i++) { + for(int i = 0; i < ps_candidates.size(); i++) { E pr_entry = ps_candidates.get(i); N pr = index.getNode(pr_entry); heaps.add(initHeaps(distFunction, pr)); @@ -148,41 +148,42 @@ public class KNNJoin<V extends NumberVector<?>, D extends Distance<D>, N extends // Build priority queue final int sqsize = ps_candidates.size() * (ps_candidates.size() - 1) >> 1; - if (LOG.isDebuggingFine()) { + if(LOG.isDebuggingFine()) { LOG.debugFine("Number of leaves: " + ps_candidates.size() + " so " + sqsize + " MBR computations."); } FiniteProgress mprogress = LOG.isVerbose() ? 
new FiniteProgress("Comparing leaf MBRs", sqsize, LOG) : null; - for (int i = 0; i < ps_candidates.size(); i++) { + for(int i = 0; i < ps_candidates.size(); i++) { E pr_entry = ps_candidates.get(i); List<KNNHeap<D>> pr_heaps = heaps.get(i); D pr_knn_distance = computeStopDistance(pr_heaps); - for (int j = i + 1; j < ps_candidates.size(); j++) { + for(int j = i + 1; j < ps_candidates.size(); j++) { E ps_entry = ps_candidates.get(j); List<KNNHeap<D>> ps_heaps = heaps.get(j); D ps_knn_distance = computeStopDistance(ps_heaps); D minDist = distFunction.minDist(pr_entry, ps_entry); // Resolve immediately: - if (minDist.isNullDistance()) { + if(minDist.isNullDistance()) { N pr = index.getNode(ps_candidates.get(i)); N ps = index.getNode(ps_candidates.get(j)); processDataPagesOptimize(distFunction, pr_heaps, ps_heaps, pr, ps); - } else if (minDist.compareTo(pr_knn_distance) <= 0 || minDist.compareTo(ps_knn_distance) <= 0) { + } + else if(minDist.compareTo(pr_knn_distance) <= 0 || minDist.compareTo(ps_knn_distance) <= 0) { pq.add(new Task(minDist, i, j)); } - if (mprogress != null) { + if(mprogress != null) { mprogress.incrementProcessed(LOG); } } } - if (mprogress != null) { + if(mprogress != null) { mprogress.ensureCompleted(LOG); } // Process the queue FiniteProgress qprogress = LOG.isVerbose() ? new FiniteProgress("Processing queue", pq.size(), LOG) : null; IndefiniteProgress fprogress = LOG.isVerbose() ? 
new IndefiniteProgress("Full comparisons", LOG) : null; - while (!pq.isEmpty()) { + while(!pq.isEmpty()) { Task task = pq.poll(); List<KNNHeap<D>> pr_heaps = heaps.get(task.i); List<KNNHeap<D>> ps_heaps = heaps.get(task.j); @@ -190,30 +191,32 @@ public class KNNJoin<V extends NumberVector<?>, D extends Distance<D>, N extends D ps_knn_distance = computeStopDistance(ps_heaps); boolean dor = task.mindist.compareTo(pr_knn_distance) <= 0; boolean dos = task.mindist.compareTo(ps_knn_distance) <= 0; - if (dor || dos) { + if(dor || dos) { N pr = index.getNode(ps_candidates.get(task.i)); N ps = index.getNode(ps_candidates.get(task.j)); - if (dor && dos) { + if(dor && dos) { processDataPagesOptimize(distFunction, pr_heaps, ps_heaps, pr, ps); - } else { - if (dor) { + } + else { + if(dor) { processDataPagesOptimize(distFunction, pr_heaps, null, pr, ps); - } else /* dos */{ + } + else /* dos */{ processDataPagesOptimize(distFunction, ps_heaps, null, ps, pr); } } - if (fprogress != null) { + if(fprogress != null) { fprogress.incrementProcessed(LOG); } } - if (qprogress != null) { + if(qprogress != null) { qprogress.incrementProcessed(LOG); } } - if (qprogress != null) { + if(qprogress != null) { qprogress.ensureCompleted(LOG); } - if (fprogress != null) { + if(fprogress != null) { fprogress.setCompleted(LOG); } @@ -223,12 +226,12 @@ public class KNNJoin<V extends NumberVector<?>, D extends Distance<D>, N extends // null; FiniteProgress pageprog = LOG.isVerbose() ? 
new FiniteProgress("Number of processed data pages", ps_candidates.size(), LOG) : null; // int processed = 0; - for (int i = 0; i < ps_candidates.size(); i++) { + for(int i = 0; i < ps_candidates.size(); i++) { N pr = index.getNode(ps_candidates.get(i)); List<KNNHeap<D>> pr_heaps = heaps.get(i); // Finalize lists - for (int j = 0; j < pr.getNumEntries(); j++) { + for(int j = 0; j < pr.getNumEntries(); j++) { knnLists.put(((LeafEntry) pr.getEntry(j)).getDBID(), pr_heaps.get(j).toKNNList()); } // Forget heaps and pq @@ -238,14 +241,14 @@ public class KNNJoin<V extends NumberVector<?>, D extends Distance<D>, N extends // if(progress != null) { // progress.setProcessed(processed, logger); // } - if (pageprog != null) { + if(pageprog != null) { pageprog.incrementProcessed(LOG); } } // if(progress != null) { // progress.ensureCompleted(logger); // } - if (pageprog != null) { + if(pageprog != null) { pageprog.ensureCompleted(LOG); } return knnLists; @@ -261,7 +264,7 @@ public class KNNJoin<V extends NumberVector<?>, D extends Distance<D>, N extends private List<KNNHeap<D>> initHeaps(SpatialPrimitiveDistanceFunction<V, D> distFunction, N pr) { List<KNNHeap<D>> pr_heaps = new ArrayList<>(pr.getNumEntries()); // Create for each data object a knn heap - for (int j = 0; j < pr.getNumEntries(); j++) { + for(int j = 0; j < pr.getNumEntries(); j++) { pr_heaps.add(DBIDUtil.newHeap(distFunction.getDistanceFactory(), k)); } // Self-join first, as this is expected to improve most and cannot be @@ -282,20 +285,21 @@ public class KNNJoin<V extends NumberVector<?>, D extends Distance<D>, N extends */ @SuppressWarnings("unchecked") private void processDataPagesOptimize(SpatialPrimitiveDistanceFunction<V, D> distFunction, List<? extends KNNHeap<D>> pr_heaps, List<? 
extends KNNHeap<D>> ps_heaps, N pr, N ps) { - if (DistanceUtil.isDoubleDistanceFunction(distFunction)) { + if(DistanceUtil.isDoubleDistanceFunction(distFunction)) { List<?> khp = (List<?>) pr_heaps; List<?> khs = (List<?>) ps_heaps; processDataPagesDouble((SpatialPrimitiveDoubleDistanceFunction<? super V>) distFunction, pr, ps, (List<DoubleDistanceKNNHeap>) khp, (List<DoubleDistanceKNNHeap>) khs); - } else { - for (int j = 0; j < ps.getNumEntries(); j++) { + } + else { + for(int j = 0; j < ps.getNumEntries(); j++) { final SpatialPointLeafEntry s_e = (SpatialPointLeafEntry) ps.getEntry(j); DBID s_id = s_e.getDBID(); - for (int i = 0; i < pr.getNumEntries(); i++) { + for(int i = 0; i < pr.getNumEntries(); i++) { final SpatialPointLeafEntry r_e = (SpatialPointLeafEntry) pr.getEntry(i); D distance = distFunction.minDist(s_e, r_e); - pr_heaps.get(i).add(distance, s_id); - if (pr != ps && ps_heaps != null) { - ps_heaps.get(j).add(distance, r_e.getDBID()); + pr_heaps.get(i).insert(distance, s_id); + if(pr != ps && ps_heaps != null) { + ps_heaps.get(j).insert(distance, r_e.getDBID()); } } } @@ -314,15 +318,15 @@ public class KNNJoin<V extends NumberVector<?>, D extends Distance<D>, N extends */ private void processDataPagesDouble(SpatialPrimitiveDoubleDistanceFunction<? 
super V> df, N pr, N ps, List<DoubleDistanceKNNHeap> pr_heaps, List<DoubleDistanceKNNHeap> ps_heaps) { // Compare pairwise - for (int j = 0; j < ps.getNumEntries(); j++) { + for(int j = 0; j < ps.getNumEntries(); j++) { final SpatialPointLeafEntry s_e = (SpatialPointLeafEntry) ps.getEntry(j); DBID s_id = s_e.getDBID(); - for (int i = 0; i < pr.getNumEntries(); i++) { + for(int i = 0; i < pr.getNumEntries(); i++) { final SpatialPointLeafEntry r_e = (SpatialPointLeafEntry) pr.getEntry(i); double distance = df.doubleMinDist(s_e, r_e); - pr_heaps.get(i).add(distance, s_id); - if (pr != ps && ps_heaps != null) { - ps_heaps.get(j).add(distance, r_e.getDBID()); + pr_heaps.get(i).insert(distance, s_id); + if(pr != ps && ps_heaps != null) { + ps_heaps.get(j).insert(distance, r_e.getDBID()); } } } @@ -337,15 +341,16 @@ public class KNNJoin<V extends NumberVector<?>, D extends Distance<D>, N extends private D computeStopDistance(List<KNNHeap<D>> heaps) { // Update pruning distance D pr_knn_distance = null; - for (KNNHeap<D> knnList : heaps) { + for(KNNHeap<D> knnList : heaps) { // set kNN distance of r - if (pr_knn_distance == null) { + if(pr_knn_distance == null) { pr_knn_distance = knnList.getKNNDistance(); - } else { + } + else { pr_knn_distance = DistanceUtil.max(knnList.getKNNDistance(), pr_knn_distance); } } - if (pr_knn_distance == null) { + if(pr_knn_distance == null) { return getDistanceFunction().getDistanceFactory().infiniteDistance(); } return pr_knn_distance; @@ -421,8 +426,8 @@ public class KNNJoin<V extends NumberVector<?>, D extends Distance<D>, N extends protected void makeOptions(Parameterization config) { super.makeOptions(config); IntParameter kP = new IntParameter(K_ID, 1); - kP.addConstraint(new GreaterConstraint(0)); - if (config.grab(kP)) { + kP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT); + if(config.grab(kP)) { k = kP.getValue(); } } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/benchmark/ValidateApproximativeKNNIndex.java 
b/src/de/lmu/ifi/dbs/elki/algorithm/benchmark/ValidateApproximativeKNNIndex.java index 3d0ea52a..8b83b5d4 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/benchmark/ValidateApproximativeKNNIndex.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/benchmark/ValidateApproximativeKNNIndex.java @@ -36,9 +36,9 @@ import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; import de.lmu.ifi.dbs.elki.database.ids.distance.KNNList; import de.lmu.ifi.dbs.elki.database.query.DatabaseQuery; +import de.lmu.ifi.dbs.elki.database.query.LinearScanQuery; import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; import de.lmu.ifi.dbs.elki.database.query.knn.KNNQuery; -import de.lmu.ifi.dbs.elki.database.query.knn.LinearScanKNNQuery; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.datasource.DatabaseConnection; import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle; @@ -141,32 +141,35 @@ public class ValidateApproximativeKNNIndex<O, D extends Distance<D>> extends Abs DistanceQuery<O, D> distQuery = database.getDistanceQuery(relation, getDistanceFunction()); // Approximate query: KNNQuery<O, D> knnQuery = database.getKNNQuery(distQuery, k, DatabaseQuery.HINT_OPTIMIZED_ONLY); - if (knnQuery == null || knnQuery instanceof LinearScanKNNQuery) { + if(knnQuery == null || knnQuery instanceof LinearScanQuery) { throw new AbortException("Expected an accelerated query, but got a linear scan -- index is not used."); } // Exact query: KNNQuery<O, D> truekNNQuery; - if (forcelinear) { + if(forcelinear) { truekNNQuery = QueryUtil.getLinearScanKNNQuery(distQuery); - } else { + } + else { truekNNQuery = database.getKNNQuery(distQuery, k, DatabaseQuery.HINT_EXACT); } - if (knnQuery.getClass().equals(truekNNQuery.getClass())) { + if(knnQuery.getClass().equals(truekNNQuery.getClass())) { LOG.warning("Query classes are the same. This experiment may be invalid!"); } // No query set - use original database. 
- if (queries == null || pattern != null) { + if(queries == null || pattern != null) { // Relation to filter on Relation<String> lrel = (pattern != null) ? DatabaseUtil.guessLabelRepresentation(database) : null; final DBIDs sample; - if (sampling <= 0) { + if(sampling <= 0) { sample = relation.getDBIDs(); - } else if (sampling < 1.1) { + } + else if(sampling < 1.1) { int size = (int) Math.min(sampling * relation.size(), relation.size()); sample = DBIDUtil.randomSample(relation.getDBIDs(), size, random); - } else { + } + else { int size = (int) Math.min(sampling, relation.size()); sample = DBIDUtil.randomSample(relation.getDBIDs(), size, random); } @@ -174,8 +177,8 @@ public class ValidateApproximativeKNNIndex<O, D extends Distance<D>> extends Abs MeanVariance mv = new MeanVariance(), mvrec = new MeanVariance(); MeanVariance mvdist = new MeanVariance(), mvdaerr = new MeanVariance(), mvdrerr = new MeanVariance(); int misses = 0; - for (DBIDIter iditer = sample.iter(); iditer.valid(); iditer.advance()) { - if (pattern == null || pattern.matcher(lrel.get(iditer)).find()) { + for(DBIDIter iditer = sample.iter(); iditer.valid(); iditer.advance()) { + if(pattern == null || pattern.matcher(lrel.get(iditer)).find()) { // Query index: KNNList<D> knns = knnQuery.getKNNForDBID(iditer, k); // Query reference: @@ -187,53 +190,55 @@ public class ValidateApproximativeKNNIndex<O, D extends Distance<D>> extends Abs // Put recall: mvrec.put(DBIDUtil.intersectionSize(knns, trueknns) / trueknns.size()); - if (knns.size() >= k) { + if(knns.size() >= k) { D kdist = knns.getKNNDistance(); - if (kdist instanceof NumberDistance) { + if(kdist instanceof NumberDistance) { final double dist = ((NumberDistance<?, ?>) kdist).doubleValue(); final double tdist = ((NumberDistance<?, ?>) trueknns.getKNNDistance()).doubleValue(); - if (tdist > 0.0) { + if(tdist > 0.0) { mvdist.put(dist); mvdaerr.put(dist - tdist); mvdrerr.put(dist / tdist); } } - } else { + } + else { // Less than k objects. 
misses++; } } - if (prog != null) { + if(prog != null) { prog.incrementProcessed(LOG); } } - if (prog != null) { + if(prog != null) { prog.ensureCompleted(LOG); } - if (LOG.isStatistics()) { + if(LOG.isStatistics()) { LOG.statistics("Mean number of results: " + mv.getMean() + " +- " + mv.getNaiveStddev()); LOG.statistics("Recall of true results: " + mvrec.getMean() + " +- " + mvrec.getNaiveStddev()); - if (mvdist.getCount() > 0) { + if(mvdist.getCount() > 0) { LOG.statistics("Mean k-distance: " + mvdist.getMean() + " +- " + mvdist.getNaiveStddev()); LOG.statistics("Mean absolute k-error: " + mvdaerr.getMean() + " +- " + mvdaerr.getNaiveStddev()); LOG.statistics("Mean relative k-error: " + mvdrerr.getMean() + " +- " + mvdrerr.getNaiveStddev()); } - if (misses > 0) { + if(misses > 0) { LOG.statistics(String.format("Number of queries that returned less than k=%d objects: %d (%.2f%%)", k, misses, misses * 100. / mv.getCount())); } } - } else { + } + else { // Separate query set. TypeInformation res = getDistanceFunction().getInputTypeRestriction(); MultipleObjectsBundle bundle = queries.loadData(); int col = -1; - for (int i = 0; i < bundle.metaLength(); i++) { - if (res.isAssignableFromType(bundle.meta(i))) { + for(int i = 0; i < bundle.metaLength(); i++) { + if(res.isAssignableFromType(bundle.meta(i))) { col = i; break; } } - if (col < 0) { + if(col < 0) { throw new AbortException("No compatible data type in query input was found. Expected: " + res.toString()); } // Random sampling is a bit of hack, sorry. 
@@ -241,12 +246,14 @@ public class ValidateApproximativeKNNIndex<O, D extends Distance<D>> extends Abs DBIDRange sids = DBIDUtil.generateStaticDBIDRange(bundle.dataLength()); final DBIDs sample; - if (sampling <= 0) { + if(sampling <= 0) { sample = sids; - } else if (sampling < 1.1) { + } + else if(sampling < 1.1) { int size = (int) Math.min(sampling * relation.size(), relation.size()); sample = DBIDUtil.randomSample(sids, size, random); - } else { + } + else { int size = (int) Math.min(sampling, sids.size()); sample = DBIDUtil.randomSample(sids, size, random); } @@ -254,7 +261,7 @@ public class ValidateApproximativeKNNIndex<O, D extends Distance<D>> extends Abs MeanVariance mv = new MeanVariance(), mvrec = new MeanVariance(); MeanVariance mvdist = new MeanVariance(), mvdaerr = new MeanVariance(), mvdrerr = new MeanVariance(); int misses = 0; - for (DBIDIter iditer = sample.iter(); iditer.valid(); iditer.advance()) { + for(DBIDIter iditer = sample.iter(); iditer.valid(); iditer.advance()) { int off = sids.binarySearch(iditer); assert (off >= 0); @SuppressWarnings("unchecked") @@ -271,36 +278,37 @@ public class ValidateApproximativeKNNIndex<O, D extends Distance<D>> extends Abs // Put recall: mvrec.put(DBIDUtil.intersectionSize(knns, trueknns) / trueknns.size()); - if (knns.size() >= k) { + if(knns.size() >= k) { D kdist = knns.getKNNDistance(); - if (kdist instanceof NumberDistance) { + if(kdist instanceof NumberDistance) { final double dist = ((NumberDistance<?, ?>) kdist).doubleValue(); final double tdist = ((NumberDistance<?, ?>) trueknns.getKNNDistance()).doubleValue(); - if (tdist > 0.0) { + if(tdist > 0.0) { mvdist.put(dist); mvdaerr.put(dist - tdist); mvdrerr.put(dist / tdist); } } - } else { + } + else { // Less than k objects. 
misses++; } - if (prog != null) { + if(prog != null) { prog.incrementProcessed(LOG); } } - if (prog != null) { + if(prog != null) { prog.ensureCompleted(LOG); } - if (LOG.isStatistics()) { + if(LOG.isStatistics()) { LOG.statistics("Mean number of results: " + mv.getMean() + " +- " + mv.getNaiveStddev()); LOG.statistics("Recall of true results: " + mvrec.getMean() + " +- " + mvrec.getNaiveStddev()); - if (mvdist.getCount() > 0) { + if(mvdist.getCount() > 0) { LOG.statistics("Mean absolute k-error: " + mvdaerr.getMean() + " +- " + mvdaerr.getNaiveStddev()); LOG.statistics("Mean relative k-error: " + mvdrerr.getMean() + " +- " + mvdrerr.getNaiveStddev()); } - if (misses > 0) { + if(misses > 0) { LOG.statistics(String.format("Number of queries that returned less than k=%d objects: %d (%.2f%%)", k, misses, misses * 100. / mv.getCount())); } } @@ -393,31 +401,32 @@ public class ValidateApproximativeKNNIndex<O, D extends Distance<D>> extends Abs protected void makeOptions(Parameterization config) { super.makeOptions(config); IntParameter kP = new IntParameter(K_ID); - if (config.grab(kP)) { + if(config.grab(kP)) { k = kP.intValue(); } PatternParameter patternP = new PatternParameter(PATTERN_ID); patternP.setOptional(true); - if (config.grab(patternP)) { + if(config.grab(patternP)) { pattern = patternP.getValue(); - } else { + } + else { ObjectParameter<DatabaseConnection> queryP = new ObjectParameter<>(QUERY_ID, DatabaseConnection.class); queryP.setOptional(true); - if (config.grab(queryP)) { + if(config.grab(queryP)) { queries = queryP.instantiateClass(config); } } DoubleParameter samplingP = new DoubleParameter(SAMPLING_ID); samplingP.setOptional(true); - if (config.grab(samplingP)) { + if(config.grab(samplingP)) { sampling = samplingP.doubleValue(); } Flag forceP = new Flag(FORCE_ID); - if (config.grab(forceP)) { + if(config.grab(forceP)) { forcelinear = forceP.isTrue(); } RandomParameter randomP = new RandomParameter(RANDOM_ID, RandomFactory.DEFAULT); - if 
(config.grab(randomP)) { + if(config.grab(randomP)) { random = randomP.getValue(); } } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/AbstractProjectedClustering.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/AbstractProjectedClustering.java index 0c4eb5fc..96c95a9f 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/AbstractProjectedClustering.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/AbstractProjectedClustering.java @@ -35,7 +35,7 @@ import de.lmu.ifi.dbs.elki.distance.distancefunction.minkowski.EuclideanDistance import de.lmu.ifi.dbs.elki.distance.distancevalue.DoubleDistance; import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; @@ -152,8 +152,8 @@ public abstract class AbstractProjectedClustering<R extends Clustering<?>, V ext */ protected void configK(Parameterization config) { IntParameter kP = new IntParameter(K_ID); - kP.addConstraint(new GreaterConstraint(0)); - if (config.grab(kP)) { + kP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT); + if(config.grab(kP)) { k = kP.getValue(); } } @@ -165,8 +165,8 @@ public abstract class AbstractProjectedClustering<R extends Clustering<?>, V ext */ protected void configKI(Parameterization config) { IntParameter k_iP = new IntParameter(K_I_ID, 30); - k_iP.addConstraint(new GreaterConstraint(0)); - if (config.grab(k_iP)) { + k_iP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT); + if(config.grab(k_iP)) { k_i = k_iP.getValue(); } } @@ -178,8 +178,8 @@ public abstract class AbstractProjectedClustering<R extends Clustering<?>, V ext */ protected void 
configL(Parameterization config) { IntParameter lP = new IntParameter(L_ID); - lP.addConstraint(new GreaterConstraint(0)); - if (config.grab(lP)) { + lP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT); + if(config.grab(lP)) { l = lP.getValue(); } } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/AbstractProjectedDBSCAN.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/AbstractProjectedDBSCAN.java index ee3b234c..52e37197 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/AbstractProjectedDBSCAN.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/AbstractProjectedDBSCAN.java @@ -55,7 +55,7 @@ import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress; import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress; import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ChainedParameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ListParameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; @@ -294,7 +294,7 @@ public abstract class AbstractProjectedDBSCAN<R extends Clustering<Model>, V ext // try to expand the cluster ModifiableDBIDs currentCluster = DBIDUtil.newArray(); ModifiableDBIDs seeds = DBIDUtil.newHashSet(); - for (DistanceDBIDListIter<DoubleDistance> seed = neighbors.iter(); seed.valid(); seed.advance()) { + for(DistanceDBIDListIter<DoubleDistance> seed = neighbors.iter(); seed.valid(); seed.advance()) { int nextID_corrDim = distFunc.getIndex().getLocalProjection(seed).getCorrelationDimension(); // nextID is not reachable from start object if(nextID_corrDim > lambda) { @@ -322,9 +322,9 @@ public abstract class AbstractProjectedDBSCAN<R 
extends Clustering<Model>, V ext DistanceDBIDList<DoubleDistance> reachables = rangeQuery.getRangeForDBID(iter, epsilon); iter.remove(); - + if(reachables.size() > minpts) { - for (DistanceDBIDListIter<DoubleDistance> r = reachables.iter(); r.valid(); r.advance()) { + for(DistanceDBIDListIter<DoubleDistance> r = reachables.iter(); r.valid(); r.advance()) { int corrDim_r = distFunc.getIndex().getLocalProjection(r).getCorrelationDimension(); // r is not reachable from q if(corrDim_r > lambda) { @@ -351,9 +351,10 @@ public abstract class AbstractProjectedDBSCAN<R extends Clustering<Model>, V ext } } - /* if(processedIDs.size() == relation.size() && noise.size() == 0) { - break; - } */ + /* + * if(processedIDs.size() == relation.size() && noise.size() == 0) { + * break; } + */ } if(currentCluster.size() >= minpts) { @@ -375,7 +376,7 @@ public abstract class AbstractProjectedDBSCAN<R extends Clustering<Model>, V ext public TypeInformation[] getInputTypeRestriction() { return TypeUtil.array(distanceFunction.getInputTypeRestriction()); } - + /** * Parameterization class. 
* @@ -411,7 +412,7 @@ public abstract class AbstractProjectedDBSCAN<R extends Clustering<Model>, V ext protected void configMinPts(Parameterization config) { IntParameter minptsP = new IntParameter(MINPTS_ID); - minptsP.addConstraint(new GreaterConstraint(0)); + minptsP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT); if(config.grab(minptsP)) { minpts = minptsP.getValue(); } @@ -435,7 +436,7 @@ public abstract class AbstractProjectedDBSCAN<R extends Clustering<Model>, V ext protected void configLambda(Parameterization config) { IntParameter lambdaP = new IntParameter(LAMBDA_ID); - lambdaP.addConstraint(new GreaterConstraint(0)); + lambdaP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT); if(config.grab(lambdaP)) { lambda = lambdaP.getValue(); } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/DBSCAN.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/DBSCAN.java index 57dcb435..09c78fec 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/DBSCAN.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/DBSCAN.java @@ -38,9 +38,8 @@ import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDMIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDRef; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; -import de.lmu.ifi.dbs.elki.database.ids.HashSetModifiableDBIDs; +import de.lmu.ifi.dbs.elki.database.ids.DBIDs; import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; -import de.lmu.ifi.dbs.elki.database.ids.distance.DistanceDBIDList; import de.lmu.ifi.dbs.elki.database.query.range.RangeQuery; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction; @@ -52,7 +51,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Description; import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import de.lmu.ifi.dbs.elki.utilities.documentation.Title; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; -import 
de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DistanceParameter; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; @@ -82,24 +81,12 @@ public class DBSCAN<O, D extends Distance<D>> extends AbstractDistanceBasedAlgor private static final Logging LOG = Logging.getLogger(DBSCAN.class); /** - * Parameter to specify the maximum radius of the neighborhood to be - * considered, must be suitable to the distance function specified. + * Holds the epsilon radius threshold. */ - public static final OptionID EPSILON_ID = new OptionID("dbscan.epsilon", "The maximum radius of the neighborhood to be considered."); + protected D epsilon; /** - * Holds the value of {@link #EPSILON_ID}. - */ - private D epsilon; - - /** - * Parameter to specify the threshold for minimum number of points in the - * epsilon-neighborhood of a point, must be an integer greater than 0. - */ - public static final OptionID MINPTS_ID = new OptionID("dbscan.minpts", "Threshold for minimum number of points in the epsilon-neighborhood of a point."); - - /** - * Holds the value of {@link #MINPTS_ID}. + * Holds the minimum cluster size. 
*/ protected int minpts; @@ -146,7 +133,9 @@ public class DBSCAN<O, D extends Distance<D>> extends AbstractDistanceBasedAlgor if(size < minpts) { // The can't be any clusters noise.addDBIDs(relation.getDBIDs()); - objprog.setProcessed(noise.size(), LOG); + if(objprog != null) { + objprog.setProcessed(noise.size(), LOG); + } } else { for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { @@ -193,7 +182,7 @@ public class DBSCAN<O, D extends Distance<D>> extends AbstractDistanceBasedAlgor * @param objprog the progress object for logging the current status */ protected void expandCluster(Relation<O> relation, RangeQuery<O, D> rangeQuery, DBIDRef startObjectID, FiniteProgress objprog, IndefiniteProgress clusprog) { - DistanceDBIDList<D> neighbors = rangeQuery.getRangeForDBID(startObjectID, epsilon); + DBIDs neighbors = rangeQuery.getRangeForDBID(startObjectID, epsilon); // startObject is no core-object if(neighbors.size() < minpts) { @@ -207,7 +196,7 @@ public class DBSCAN<O, D extends Distance<D>> extends AbstractDistanceBasedAlgor } // try to expand the cluster - HashSetModifiableDBIDs seeds = DBIDUtil.newHashSet(); + ModifiableDBIDs seeds = DBIDUtil.newHashSet(); ModifiableDBIDs currentCluster = DBIDUtil.newArray(); for(DBIDIter seed = neighbors.iter(); seed.valid(); seed.advance()) { if(!processedIDs.contains(seed)) { @@ -222,9 +211,9 @@ public class DBSCAN<O, D extends Distance<D>> extends AbstractDistanceBasedAlgor } seeds.remove(startObjectID); - while(seeds.size() > 0) { + while(!seeds.isEmpty()) { DBIDMIter o = seeds.iter(); - DistanceDBIDList<D> neighborhood = rangeQuery.getRangeForDBID(o, epsilon); + DBIDs neighborhood = rangeQuery.getRangeForDBID(o, epsilon); o.remove(); if(neighborhood.size() >= minpts) { @@ -282,6 +271,18 @@ public class DBSCAN<O, D extends Distance<D>> extends AbstractDistanceBasedAlgor * @apiviz.exclude */ public static class Parameterizer<O, D extends Distance<D>> extends 
AbstractDistanceBasedAlgorithm.Parameterizer<O, D> { + /** + * Parameter to specify the maximum radius of the neighborhood to be + * considered, must be suitable to the distance function specified. + */ + public static final OptionID EPSILON_ID = new OptionID("dbscan.epsilon", "The maximum radius of the neighborhood to be considered."); + + /** + * Parameter to specify the threshold for minimum number of points in the + * epsilon-neighborhood of a point, must be an integer greater than 0. + */ + public static final OptionID MINPTS_ID = new OptionID("dbscan.minpts", "Threshold for minimum number of points in the epsilon-neighborhood of a point."); + protected D epsilon = null; protected int minpts = 0; @@ -295,7 +296,7 @@ public class DBSCAN<O, D extends Distance<D>> extends AbstractDistanceBasedAlgor } IntParameter minptsP = new IntParameter(MINPTS_ID); - minptsP.addConstraint(new GreaterConstraint(0)); + minptsP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT); if(config.grab(minptsP)) { minpts = minptsP.getValue(); } @@ -306,4 +307,4 @@ public class DBSCAN<O, D extends Distance<D>> extends AbstractDistanceBasedAlgor return new DBSCAN<>(distanceFunction, epsilon, minpts); } } -}
\ No newline at end of file +} diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/DeLiClu.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/DeLiClu.java index 3c2e0278..814b4cc4 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/DeLiClu.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/DeLiClu.java @@ -62,7 +62,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import de.lmu.ifi.dbs.elki.utilities.documentation.Title; import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; @@ -496,7 +496,7 @@ public class DeLiClu<NV extends NumberVector<?>, D extends Distance<D>> extends protected void makeOptions(Parameterization config) { super.makeOptions(config); IntParameter minptsP = new IntParameter(MINPTS_ID); - minptsP.addConstraint(new GreaterConstraint(0)); + minptsP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT); if(config.grab(minptsP)) { minpts = minptsP.getValue(); } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/EM.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/EM.java index c66442a1..e82ec674 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/EM.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/EM.java @@ -34,6 +34,7 @@ import de.lmu.ifi.dbs.elki.data.Cluster; import de.lmu.ifi.dbs.elki.data.Clustering; import de.lmu.ifi.dbs.elki.data.NumberVector; import de.lmu.ifi.dbs.elki.data.model.EMModel; +import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation; import de.lmu.ifi.dbs.elki.data.type.TypeInformation; import de.lmu.ifi.dbs.elki.data.type.TypeUtil; import 
de.lmu.ifi.dbs.elki.database.Database; @@ -41,14 +42,15 @@ import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; import de.lmu.ifi.dbs.elki.database.datastore.WritableDataStore; import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; -import de.lmu.ifi.dbs.elki.database.ids.DBIDRef; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; +import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.database.relation.RelationUtil; import de.lmu.ifi.dbs.elki.distance.distancefunction.minkowski.EuclideanDistanceFunction; import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.math.MathUtil; +import de.lmu.ifi.dbs.elki.math.linearalgebra.CovarianceMatrix; import de.lmu.ifi.dbs.elki.math.linearalgebra.Matrix; import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; import de.lmu.ifi.dbs.elki.utilities.FormatUtil; @@ -57,8 +59,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import de.lmu.ifi.dbs.elki.utilities.documentation.Title; import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; @@ -72,8 +73,8 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; * zero-covariance and variance=1 in covariance matrices. * </p> * <p> - * Reference: A. 
P. Dempster, N. M. Laird, D. B. Rubin: Maximum Likelihood from - * Incomplete Data via the EM algorithm. <br> + * Reference: A. P. Dempster, N. M. Laird, D. B. Rubin:<br /> + * Maximum Likelihood from Incomplete Data via the EM algorithm.<br> * In Journal of the Royal Statistical Society, Series B, 39(1), 1977, pp. 1-31 * </p> * @@ -100,48 +101,36 @@ public class EM<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering< private static final double SINGULARITY_CHEAT = 1E-9; /** - * Parameter to specify the number of clusters to find, must be an integer - * greater than 0. - */ - public static final OptionID K_ID = new OptionID("em.k", "The number of clusters to find."); - - /** - * Holds the value of {@link #K_ID}. + * Number of clusters */ private int k; /** - * Parameter to specify the termination criterion for maximization of E(M): - * E(M) - E(M') < em.delta, must be a double equal to or greater than 0. + * Delta parameter */ - public static final OptionID DELTA_ID = new OptionID("em.delta", "The termination criterion for maximization of E(M): " + "E(M) - E(M') < em.delta"); + private double delta; /** - * Parameter to specify the initialization method + * Class to choose the initial means */ - public static final OptionID INIT_ID = new OptionID("kmeans.initialization", "Method to choose the initial means."); - - private static final double MIN_LOGLIKELIHOOD = -100000; + private KMeansInitialization<V> initializer; /** - * Holds the value of {@link #DELTA_ID}. + * Maximum number of iterations to allow */ - private double delta; + private int maxiter; /** - * Store the individual probabilities, for use by EMOutlierDetection etc. + * Retain soft assignments. 
*/ - private WritableDataStore<double[]> probClusterIGivenX; + private boolean soft; - /** - * Class to choose the initial means - */ - private KMeansInitialization<V> initializer; + private static final double MIN_LOGLIKELIHOOD = -100000; /** - * Maximum number of iterations to allow + * Soft assignment result type. */ - private int maxiter; + public static final SimpleTypeInformation<double[]> SOFT_TYPE = new SimpleTypeInformation<>(double[].class); /** * Constructor. @@ -150,13 +139,15 @@ public class EM<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering< * @param delta delta parameter * @param initializer Class to choose the initial means * @param maxiter Maximum number of iterations + * @param soft Include soft assignments */ - public EM(int k, double delta, KMeansInitialization<V> initializer, int maxiter) { + public EM(int k, double delta, KMeansInitialization<V> initializer, int maxiter, boolean soft) { super(); this.k = k; this.delta = delta; this.initializer = initializer; this.maxiter = maxiter; + this.setSoft(soft); } /** @@ -172,137 +163,80 @@ public class EM<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering< * @return Result */ public Clustering<EMModel<V>> run(Database database, Relation<V> relation) { - if (relation.size() == 0) { + if(relation.size() == 0) { throw new IllegalArgumentException("database empty: must contain elements"); } // initial models - if (LOG.isVerbose()) { + if(LOG.isVerbose()) { LOG.verbose("initializing " + k + " models"); } - List<Vector> means = new ArrayList<>(); - for (NumberVector<?> nv : initializer.chooseInitialMeans(database, relation, k, EuclideanDistanceFunction.STATIC)) { - means.add(nv.getColumnVector()); + final List<V> initialMeans = initializer.chooseInitialMeans(database, relation, k, EuclideanDistanceFunction.STATIC); + assert (initialMeans.size() == k); + Vector[] means = new Vector[k]; + { + int i = 0; + for(NumberVector<?> nv : initialMeans) { + means[i] = nv.getColumnVector(); 
+ i++; + } } - List<Matrix> covarianceMatrices = new ArrayList<>(k); + Matrix[] covarianceMatrices = new Matrix[k]; double[] normDistrFactor = new double[k]; - List<Matrix> invCovMatr = new ArrayList<>(k); + Matrix[] invCovMatr = new Matrix[k]; double[] clusterWeights = new double[k]; - probClusterIGivenX = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_SORTED, double[].class); + WritableDataStore<double[]> probClusterIGivenX = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_SORTED, double[].class); - final int dimensionality = means.get(0).getDimensionality(); - for (int i = 0; i < k; i++) { + final int dimensionality = means[0].getDimensionality(); + final double norm = MathUtil.powi(MathUtil.TWOPI, dimensionality); + for(int i = 0; i < k; i++) { Matrix m = Matrix.identity(dimensionality, dimensionality); - covarianceMatrices.add(m); - final double det = m.det(); - if (det > 0.) { - normDistrFactor[i] = 1.0 / Math.sqrt(Math.pow(MathUtil.TWOPI, dimensionality) * det); - } else { - LOG.warning("Encountered matrix with 0 determinant - degenerated."); - normDistrFactor[i] = 1.0; // Not really well defined - } - invCovMatr.add(m.inverse()); + covarianceMatrices[i] = m; + normDistrFactor[i] = 1.0 / Math.sqrt(norm); + invCovMatr[i] = Matrix.identity(dimensionality, dimensionality); clusterWeights[i] = 1.0 / k; - if (LOG.isDebuggingFinest()) { - StringBuilder msg = new StringBuilder(); - msg.append(" model ").append(i).append(":\n"); - msg.append(" mean: ").append(means.get(i)).append('\n'); - msg.append(" m:\n").append(FormatUtil.format(m, " ")).append('\n'); - msg.append(" m.det(): ").append(det).append('\n'); - msg.append(" cluster weight: ").append(clusterWeights[i]).append('\n'); - msg.append(" normDistFact: ").append(normDistrFactor[i]).append('\n'); - LOG.debugFine(msg.toString()); - } } double emNew = assignProbabilitiesToInstances(relation, normDistrFactor, means, 
invCovMatr, clusterWeights, probClusterIGivenX); // iteration unless no change - if (LOG.isVerbose()) { + if(LOG.isVerbose()) { LOG.verbose("iterating EM"); } - if (LOG.isVerbose()) { + if(LOG.isVerbose()) { LOG.verbose("iteration " + 0 + " - expectation value: " + emNew); } - double em; - for (int it = 1; it <= maxiter || maxiter < 0; it++) { - em = emNew; - - // recompute models - List<Vector> meanSums = new ArrayList<>(k); - double[] sumOfClusterProbabilities = new double[k]; - - for (int i = 0; i < k; i++) { - clusterWeights[i] = 0.0; - meanSums.add(new Vector(dimensionality)); - covarianceMatrices.set(i, Matrix.zeroMatrix(dimensionality)); - } - - // weights and means - for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { - double[] clusterProbabilities = probClusterIGivenX.get(iditer); - - for (int i = 0; i < k; i++) { - sumOfClusterProbabilities[i] += clusterProbabilities[i]; - Vector summand = relation.get(iditer).getColumnVector().timesEquals(clusterProbabilities[i]); - meanSums.get(i).plusEquals(summand); - } - } - final int n = relation.size(); - for (int i = 0; i < k; i++) { - clusterWeights[i] = sumOfClusterProbabilities[i] / n; - Vector newMean = meanSums.get(i).timesEquals(1 / sumOfClusterProbabilities[i]); - means.set(i, newMean); - } - // covariance matrices - for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { - double[] clusterProbabilities = probClusterIGivenX.get(iditer); - Vector instance = relation.get(iditer).getColumnVector(); - for (int i = 0; i < k; i++) { - Vector difference = instance.minus(means.get(i)); - covarianceMatrices.get(i).plusEquals(difference.timesTranspose(difference).timesEquals(clusterProbabilities[i])); - } - } - for (int i = 0; i < k; i++) { - covarianceMatrices.set(i, covarianceMatrices.get(i).times(1 / sumOfClusterProbabilities[i]).cheatToAvoidSingularity(SINGULARITY_CHEAT)); - } - for (int i = 0; i < k; i++) { - final double det = 
covarianceMatrices.get(i).det(); - if (det > 0.) { - normDistrFactor[i] = 1.0 / Math.sqrt(Math.pow(MathUtil.TWOPI, dimensionality) * det); - } else { - LOG.warning("Encountered matrix with 0 determinant - degenerated."); - normDistrFactor[i] = 1.0; // Not really well defined - } - invCovMatr.set(i, covarianceMatrices.get(i).inverse()); - } + for(int it = 1; it <= maxiter || maxiter < 0; it++) { + final double emOld = emNew; + recomputeCovarianceMatrices(relation, probClusterIGivenX, means, covarianceMatrices, dimensionality); + computeInverseMatrixes(covarianceMatrices, invCovMatr, normDistrFactor, norm); // reassign probabilities emNew = assignProbabilitiesToInstances(relation, normDistrFactor, means, invCovMatr, clusterWeights, probClusterIGivenX); - if (LOG.isVerbose()) { + if(LOG.isVerbose()) { LOG.verbose("iteration " + it + " - expectation value: " + emNew); } - if (Math.abs(em - emNew) <= delta) { + if(Math.abs(emOld - emNew) <= delta) { break; } } - if (LOG.isVerbose()) { + if(LOG.isVerbose()) { LOG.verbose("assigning clusters"); } // fill result with clusters and models List<ModifiableDBIDs> hardClusters = new ArrayList<>(k); - for (int i = 0; i < k; i++) { + for(int i = 0; i < k; i++) { hardClusters.add(DBIDUtil.newHashSet()); } // provide a hard clustering - for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { double[] clusterProbabilities = probClusterIGivenX.get(iditer); int maxIndex = 0; double currentMax = 0.0; - for (int i = 0; i < k; i++) { - if (clusterProbabilities[i] > currentMax) { + for(int i = 0; i < k; i++) { + if(clusterProbabilities[i] > currentMax) { maxIndex = i; currentMax = clusterProbabilities[i]; } @@ -312,24 +246,89 @@ public class EM<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering< final NumberVector.Factory<V, ?> factory = RelationUtil.getNumberVectorFactory(relation); Clustering<EMModel<V>> result = new 
Clustering<>("EM Clustering", "em-clustering"); // provide models within the result - for (int i = 0; i < k; i++) { + for(int i = 0; i < k; i++) { // TODO: re-do labeling. // SimpleClassLabel label = new SimpleClassLabel(); // label.init(result.canonicalClusterLabel(i)); - Cluster<EMModel<V>> model = new Cluster<>(hardClusters.get(i), new EMModel<>(factory.newNumberVector(means.get(i).getArrayRef()), covarianceMatrices.get(i))); + Cluster<EMModel<V>> model = new Cluster<>(hardClusters.get(i), new EMModel<>(factory.newNumberVector(means[i].getArrayRef()), covarianceMatrices[i])); result.addToplevelCluster(model); } + if(isSoft()) { + result.addChildResult(new MaterializedRelation<>("cluster assignments", "em-soft-score", SOFT_TYPE, probClusterIGivenX, relation.getDBIDs())); + } + else { + probClusterIGivenX.destroy(); + } return result; } /** + * Compute the inverse cluster matrices. + * + * @param covarianceMatrices Input covariance matrices + * @param invCovMatr Output array for inverse matrices + * @param normDistrFactor Output array for norm distribution factors. + * @param norm Normalization factor, usually (2pi)^d + */ + public static void computeInverseMatrixes(Matrix[] covarianceMatrices, Matrix[] invCovMatr, double[] normDistrFactor, final double norm) { + int k = covarianceMatrices.length; + for(int i = 0; i < k; i++) { + final double det = covarianceMatrices[i].det(); + if(det > 0.) { + normDistrFactor[i] = 1. / Math.sqrt(norm * det); + } + else { + LOG.warning("Encountered matrix with 0 determinant - degenerated."); + normDistrFactor[i] = 1.; // Not really well defined + } + invCovMatr[i] = covarianceMatrices[i].inverse(); + } + } + + /** + * Recompute the covariance matrixes. 
+ * + * @param relation Vector data + * @param probClusterIGivenX Object probabilities + * @param means Cluster means output + * @param covarianceMatrices Output covariance matrixes + * @param dimensionality Data set dimensionality + */ + public static void recomputeCovarianceMatrices(Relation<? extends NumberVector<?>> relation, WritableDataStore<double[]> probClusterIGivenX, Vector[] means, Matrix[] covarianceMatrices, final int dimensionality) { + final int k = means.length; + CovarianceMatrix[] cms = new CovarianceMatrix[k]; + for(int i = 0; i < k; i++) { + cms[i] = new CovarianceMatrix(dimensionality); + } + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + double[] clusterProbabilities = probClusterIGivenX.get(iditer); + Vector instance = relation.get(iditer).getColumnVector(); + for(int i = 0; i < k; i++) { + if(clusterProbabilities[i] > 0.) { + cms[i].put(instance, clusterProbabilities[i]); + } + } + } + for(int i = 0; i < k; i++) { + if(cms[i].getWeight() <= 0.) { + means[i] = new Vector(dimensionality); + covarianceMatrices[i] = Matrix.identity(dimensionality, dimensionality); + } + else { + means[i] = cms[i].getMeanVector(); + covarianceMatrices[i] = cms[i].destroyToNaiveMatrix().cheatToAvoidSingularity(SINGULARITY_CHEAT); + } + } + } + + /** * Assigns the current probability values to the instances in the database and * compute the expectation value of the current mixture of distributions. * * Computed as the sum of the logarithms of the prior probability of each * instance. 
* - * @param database the database used for assignment to instances + * @param relation the database used for assignment to instances * @param normDistrFactor normalization factor for density function, based on * current covariance matrix * @param means the current means @@ -337,58 +336,55 @@ public class EM<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering< * @param clusterWeights the weights of the current clusters * @return the expectation value of the current mixture of distributions */ - protected double assignProbabilitiesToInstances(Relation<V> database, double[] normDistrFactor, List<Vector> means, List<Matrix> invCovMatr, double[] clusterWeights, WritableDataStore<double[]> probClusterIGivenX) { - double emSum = 0.0; + public static double assignProbabilitiesToInstances(Relation<? extends NumberVector<?>> relation, double[] normDistrFactor, Vector[] means, Matrix[] invCovMatr, double[] clusterWeights, WritableDataStore<double[]> probClusterIGivenX) { + final int k = clusterWeights.length; + double emSum = 0.; - for (DBIDIter iditer = database.iterDBIDs(); iditer.valid(); iditer.advance()) { - Vector x = database.get(iditer).getColumnVector(); + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + Vector x = relation.get(iditer).getColumnVector(); double[] probabilities = new double[k]; - for (int i = 0; i < k; i++) { - Vector difference = x.minus(means.get(i)); - double rowTimesCovTimesCol = difference.transposeTimesTimes(invCovMatr.get(i), difference); - double power = rowTimesCovTimesCol / 2.0; + for(int i = 0; i < k; i++) { + Vector difference = x.minus(means[i]); + double rowTimesCovTimesCol = difference.transposeTimesTimes(invCovMatr[i], difference); + double power = rowTimesCovTimesCol / 2.; double prob = normDistrFactor[i] * Math.exp(-power); - if (LOG.isDebuggingFinest()) { - LOG.debugFinest(" difference vector= ( " + difference.toString() + " )\n" + " difference:\n" + FormatUtil.format(difference, " ") + 
"\n" + " rowTimesCovTimesCol:\n" + rowTimesCovTimesCol + "\n" + " power= " + power + "\n" + " prob=" + prob + "\n" + " inv cov matrix: \n" + FormatUtil.format(invCovMatr.get(i), " ")); + if(LOG.isDebuggingFinest()) { + LOG.debugFinest(" difference vector= ( " + difference.toString() + " )\n" + // + " difference:\n" + FormatUtil.format(difference, " ") + "\n" + // + " rowTimesCovTimesCol:\n" + rowTimesCovTimesCol + "\n" + // + " power= " + power + "\n" + " prob=" + prob + "\n" + // + " inv cov matrix: \n" + FormatUtil.format(invCovMatr[i], " ")); } - if (!(prob >= 0.)) { + if(!(prob >= 0.)) { LOG.warning("Invalid probability: " + prob + " power: " + power + " factor: " + normDistrFactor[i]); + prob = 0.; } probabilities[i] = prob; } - double priorProbability = 0.0; - for (int i = 0; i < k; i++) { + double priorProbability = 0.; + for(int i = 0; i < k; i++) { priorProbability += probabilities[i] * clusterWeights[i]; } double logP = Math.max(Math.log(priorProbability), MIN_LOGLIKELIHOOD); - if (!Double.isNaN(logP)) { + if(!Double.isNaN(logP)) { emSum += logP; } double[] clusterProbabilities = new double[k]; - for (int i = 0; i < k; i++) { - assert (clusterWeights[i] >= 0.0); + for(int i = 0; i < k; i++) { + assert (clusterWeights[i] >= 0.); // do not divide by zero! - if (priorProbability > 0.0) { + if(priorProbability > 0.) { clusterProbabilities[i] = probabilities[i] / priorProbability * clusterWeights[i]; - } else { - clusterProbabilities[i] = 0.0; + } + else { + clusterProbabilities[i] = 0.; } } probClusterIGivenX.put(iditer, clusterProbabilities); } - return emSum; - } - - /** - * Get the probabilities for a given point. 
- * - * @param index Point ID - * @return Probabilities of given point - */ - public double[] getProbClusterIGivenX(DBIDRef index) { - return probClusterIGivenX.get(index); + return emSum / relation.size(); } @Override @@ -402,6 +398,20 @@ public class EM<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering< } /** + * @return the soft + */ + public boolean isSoft() { + return soft; + } + + /** + * @param soft the soft to set + */ + public void setSoft(boolean soft) { + this.soft = soft; + } + + /** * Parameterization class. * * @author Erich Schubert @@ -409,45 +419,77 @@ public class EM<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering< * @apiviz.exclude */ public static class Parameterizer<V extends NumberVector<?>> extends AbstractParameterizer { + /** + * Parameter to specify the number of clusters to find, must be an integer + * greater than 0. + */ + public static final OptionID K_ID = new OptionID("em.k", "The number of clusters to find."); + + /** + * Parameter to specify the termination criterion for maximization of E(M): + * E(M) - E(M') < em.delta, must be a double equal to or greater than 0. + */ + public static final OptionID DELTA_ID = new OptionID("em.delta", // + "The termination criterion for maximization of E(M): " + // + "E(M) - E(M') < em.delta"); + + /** + * Parameter to specify the initialization method + */ + public static final OptionID INIT_ID = new OptionID("kmeans.initialization", // + "Method to choose the initial means."); + + /** + * Number of clusters. + */ protected int k; + /** + * Stopping threshold + */ protected double delta; + /** + * Initialization method + */ protected KMeansInitialization<V> initializer; + /** + * Maximum number of iterations. 
+ */ protected int maxiter = -1; @Override protected void makeOptions(Parameterization config) { super.makeOptions(config); IntParameter kP = new IntParameter(K_ID); - kP.addConstraint(new GreaterConstraint(0)); - if (config.grab(kP)) { + kP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT); + if(config.grab(kP)) { k = kP.getValue(); } ObjectParameter<KMeansInitialization<V>> initialP = new ObjectParameter<>(INIT_ID, KMeansInitialization.class, RandomlyGeneratedInitialMeans.class); - if (config.grab(initialP)) { + if(config.grab(initialP)) { initializer = initialP.instantiateClass(config); } DoubleParameter deltaP = new DoubleParameter(DELTA_ID, 0.0); - deltaP.addConstraint(new GreaterEqualConstraint(0.0)); - if (config.grab(deltaP)) { + deltaP.addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_DOUBLE); + if(config.grab(deltaP)) { delta = deltaP.getValue(); } IntParameter maxiterP = new IntParameter(KMeans.MAXITER_ID); - maxiterP.addConstraint(new GreaterEqualConstraint(0)); + maxiterP.addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_INT); maxiterP.setOptional(true); - if (config.grab(maxiterP)) { + if(config.grab(maxiterP)) { maxiter = maxiterP.getValue(); } } @Override protected EM<V> makeInstance() { - return new EM<>(k, delta, initializer, maxiter); + return new EM<>(k, delta, initializer, maxiter, false); } } } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICS.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICS.java index e928d041..a4a922df 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICS.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICS.java @@ -33,10 +33,10 @@ import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.ids.distance.DistanceDBIDList; -import de.lmu.ifi.dbs.elki.database.ids.distance.DistanceDBIDPair; import 
de.lmu.ifi.dbs.elki.database.ids.distance.DistanceDBIDListIter; -import de.lmu.ifi.dbs.elki.database.ids.distance.DoubleDistanceDBIDPair; +import de.lmu.ifi.dbs.elki.database.ids.distance.DistanceDBIDPair; import de.lmu.ifi.dbs.elki.database.ids.distance.DoubleDistanceDBIDListIter; +import de.lmu.ifi.dbs.elki.database.ids.distance.DoubleDistanceDBIDPair; import de.lmu.ifi.dbs.elki.database.query.range.RangeQuery; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.distance.DistanceUtil; @@ -55,7 +55,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Description; import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import de.lmu.ifi.dbs.elki.utilities.documentation.Title; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DistanceParameter; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; @@ -146,7 +146,8 @@ public class OPTICS<O, D extends Distance<D>> extends AbstractDistanceBasedAlgor // boxing/unboxing. for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { if(!processedIDs.contains(iditer)) { - // We need to do some ugly casts to be able to run the optimized version, unfortunately. + // We need to do some ugly casts to be able to run the optimized + // version, unfortunately. 
@SuppressWarnings("unchecked") final ClusterOrderResult<DoubleDistance> doubleClusterOrder = ClusterOrderResult.class.cast(clusterOrder); @SuppressWarnings("unchecked") @@ -304,7 +305,7 @@ public class OPTICS<O, D extends Distance<D>> extends AbstractDistanceBasedAlgor } IntParameter minptsP = new IntParameter(MINPTS_ID); - minptsP.addConstraint(new GreaterConstraint(0)); + minptsP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT); if(config.grab(minptsP)) { minpts = minptsP.intValue(); } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICSXi.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICSXi.java index 583d402b..db343f3a 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICSXi.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICSXi.java @@ -48,8 +48,7 @@ import de.lmu.ifi.dbs.elki.result.optics.ClusterOrderEntry; import de.lmu.ifi.dbs.elki.result.optics.ClusterOrderResult; import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.LessConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ClassParameter; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter; @@ -240,6 +239,10 @@ public class OPTICSXi<N extends NumberDistance<N, ?>> extends AbstractAlgorithm< // By default, clusters cover both the steep up and steep down area int cstart = sda.getStartIndex(); int cend = sua.getEndIndex(); + // Hotfix: never include infinity-reachable points at the end + while(cend > cstart && Double.isInfinite(clusterOrder.get(cend).getReachability().doubleValue())) { + --cend; + } // However, we 
sometimes have to adjust this (Condition 4): { // Case b) @@ -654,8 +657,8 @@ public class OPTICSXi<N extends NumberDistance<N, ?>> extends AbstractAlgorithm< protected void makeOptions(Parameterization config) { super.makeOptions(config); DoubleParameter xiP = new DoubleParameter(XI_ID); - xiP.addConstraint(new GreaterEqualConstraint(0.0)); - xiP.addConstraint(new LessConstraint(1.0)); + xiP.addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_DOUBLE); + xiP.addConstraint(CommonConstraints.LESS_THAN_ONE_DOUBLE); if(config.grab(xiP)) { xi = xiP.doubleValue(); } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/SNNClustering.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/SNNClustering.java index 95d9f23c..86bb9a09 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/SNNClustering.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/SNNClustering.java @@ -53,7 +53,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import de.lmu.ifi.dbs.elki.utilities.documentation.Title; import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DistanceParameter; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; @@ -328,7 +328,7 @@ public class SNNClustering<O> extends AbstractAlgorithm<Clustering<Model>> imple } IntParameter minptsP = new IntParameter(MINPTS_ID); - minptsP.addConstraint(new GreaterConstraint(0)); + minptsP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT); if(config.grab(minptsP)) { minpts = minptsP.intValue(); } diff --git 
a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/affinitypropagation/AffinityPropagationClusteringAlgorithm.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/affinitypropagation/AffinityPropagationClusteringAlgorithm.java new file mode 100644 index 00000000..68dacf34 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/affinitypropagation/AffinityPropagationClusteringAlgorithm.java @@ -0,0 +1,350 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.affinitypropagation; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2013 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +import gnu.trove.iterator.TIntObjectIterator; +import gnu.trove.map.hash.TIntObjectHashMap; +import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm; +import de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm; +import de.lmu.ifi.dbs.elki.data.Cluster; +import de.lmu.ifi.dbs.elki.data.Clustering; +import de.lmu.ifi.dbs.elki.data.model.MedoidModel; +import de.lmu.ifi.dbs.elki.data.type.TypeInformation; +import de.lmu.ifi.dbs.elki.data.type.TypeUtil; +import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs; +import de.lmu.ifi.dbs.elki.database.ids.DBIDArrayIter; +import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress; +import de.lmu.ifi.dbs.elki.logging.progress.MutableProgress; +import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; +import de.lmu.ifi.dbs.elki.utilities.documentation.Title; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; + +/** + * Cluster analysis by affinity propagation. + * + * Reference: + * <p> + * Clustering by Passing Messages Between Data Points<br /> + * B. J. Frey and D. 
Dueck<br /> + * Science Vol 315 + * </p> + * + * @author Erich Schubert + * + * @apiviz.composedOf AffinityPropagationInitialization + * + * @param <O> object type + */ +@Title("Affinity Propagation: Clustering by Passing Messages Between Data Points") +@Reference(title = "Clustering by Passing Messages Between Data Points", authors = "B. J. Frey and D. Dueck", booktitle = "Science Vol 315", url = "http://dx.doi.org/10.1126/science.1136800") +public class AffinityPropagationClusteringAlgorithm<O> extends AbstractAlgorithm<Clustering<MedoidModel>> implements ClusteringAlgorithm<Clustering<MedoidModel>> { + /** + * Class logger + */ + private static final Logging LOG = Logging.getLogger(AffinityPropagationClusteringAlgorithm.class); + + /** + * Similarity initialization + */ + AffinityPropagationInitialization<O> initialization; + + /** + * Damping factor lambda. + */ + double lambda = 0.5; + + /** + * Terminate after 10 iterations with no changes. + */ + int convergence = 10; + + /** + * Maximum number of iterations. + */ + int maxiter = 1000; + + /** + * Constructor. + * + * @param initialization Similarity initialization + * @param lambda Damping factor + * @param convergence Termination threshold (Number of stable iterations) + * @param maxiter Maximum number of iterations + */ + public AffinityPropagationClusteringAlgorithm(AffinityPropagationInitialization<O> initialization, double lambda, int convergence, int maxiter) { + super(); + this.initialization = initialization; + this.lambda = lambda; + this.convergence = convergence; + this.maxiter = maxiter; + } + + /** + * Perform affinity propagation clustering. 
+ * + * @param db Database + * @param relation Relation + * @return Clustering result + */ + public Clustering<MedoidModel> run(Database db, Relation<O> relation) { + ArrayDBIDs ids = DBIDUtil.ensureArray(relation.getDBIDs()); + final int size = ids.size(); + + int[] assignment = new int[size]; + double[][] s = initialization.getSimilarityMatrix(db, relation, ids); + double[][] r = new double[size][size]; + double[][] a = new double[size][size]; + + IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("Affinity Propagation Iteration", LOG) : null; + MutableProgress aprog = LOG.isVerbose() ? new MutableProgress("Stable assignments", size + 1, LOG) : null; + + int inactive = 0; + for(int iteration = 0; iteration < maxiter && inactive < convergence; iteration++) { + // Update responsibility matrix: + for(int i = 0; i < size; i++) { + double[] ai = a[i], ri = r[i], si = s[i]; + // Find the two largest values (as initially maxk == i) + double max1 = Double.NEGATIVE_INFINITY, max2 = Double.NEGATIVE_INFINITY; + int maxk = -1; + for(int k = 0; k < size; k++) { + double val = ai[k] + si[k]; + if(val > max1) { + max2 = max1; + max1 = val; + maxk = k; + } + else if(val > max2) { + max2 = val; + } + } + // With the maximum value known, update r: + for(int k = 0; k < size; k++) { + double val = si[k] - ((k != maxk) ? max1 : max2); + ri[k] = ri[k] * lambda + val * (1. - lambda); + } + } + // Update availability matrix + for(int k = 0; k < size; k++) { + // Compute sum of max(0, r_ik) for all i. + // For r_kk, don't apply the max. + double colposum = 0.; + for(int i = 0; i < size; i++) { + if(i == k || r[i][k] > 0.) { + colposum += r[i][k]; + } + } + for(int i = 0; i < size; i++) { + double val = colposum; + // Adjust column sum by the one extra term. + if(i == k || r[i][k] > 0.) { + val -= r[i][k]; + } + if(i != k && val > 0.) 
{ // min + val = 0.; + } + a[i][k] = a[i][k] * lambda + val * (1 - lambda); + } + } + int changed = 0; + for(int i = 0; i < size; i++) { + double[] ai = a[i], ri = r[i]; + double max = Double.NEGATIVE_INFINITY; + int maxj = -1; + for(int j = 0; j < size; j++) { + double v = ai[j] + ri[j]; + if(v > max || (i == j && v >= max)) { + max = v; + maxj = j; + } + } + if(assignment[i] != maxj) { + changed += 1; + assignment[i] = maxj; + } + } + inactive = (changed > 0) ? 0 : (inactive + 1); + if(prog != null) { + prog.incrementProcessed(LOG); + } + if(aprog != null) { + aprog.setProcessed(size - changed, LOG); + } + } + if(aprog != null) { + aprog.setProcessed(aprog.getTotal(), LOG); + } + if(prog != null) { + prog.setCompleted(LOG); + } + // Cluster map, by lead object + TIntObjectHashMap<ModifiableDBIDs> map = new TIntObjectHashMap<>(); + DBIDArrayIter i1 = ids.iter(); + for(int i = 0; i1.valid(); i1.advance(), i++) { + int c = assignment[i]; + // Add to cluster members: + ModifiableDBIDs cids = map.get(c); + if(cids == null) { + cids = DBIDUtil.newArray(); + map.put(c, cids); + } + cids.add(i1); + } + // If we stopped early, the cluster lead might be in a different cluster. + for(TIntObjectIterator<ModifiableDBIDs> iter = map.iterator(); iter.hasNext();) { + iter.advance(); // Trove iterator; advance first! + final int key = iter.key(); + int targetkey = key; + ModifiableDBIDs tids = null; + // Chase arrows: + while(tids == null && assignment[targetkey] != targetkey) { + targetkey = assignment[targetkey]; + tids = map.get(targetkey); + } + if(tids != null && targetkey != key) { + tids.addDBIDs(iter.value()); + iter.remove(); + } + } + + Clustering<MedoidModel> clustering = new Clustering<>("Affinity Propagation Clustering", "ap-clustering"); + ModifiableDBIDs noise = DBIDUtil.newArray(); + for(TIntObjectIterator<ModifiableDBIDs> iter = map.iterator(); iter.hasNext();) { + iter.advance(); // Trove iterator; advance first! 
+ i1.seek(iter.key()); + if(iter.value().size() > 1) { + MedoidModel mod = new MedoidModel(DBIDUtil.deref(i1)); + clustering.addToplevelCluster(new Cluster<>(iter.value(), mod)); + } + else { + noise.add(i1); + } + } + if(noise.size() > 0) { + MedoidModel mod = new MedoidModel(DBIDUtil.deref(noise.iter())); + clustering.addToplevelCluster(new Cluster<>(noise, true, mod)); + } + return clustering; + } + + @Override + public TypeInformation[] getInputTypeRestriction() { + return TypeUtil.array(initialization.getInputTypeRestriction()); + } + + @Override + protected Logging getLogger() { + return LOG; + } + + /** + * Parameterization class. + * + * @author Erich Schubert + * + * @apiviz.exclude + * + * @param <O> object type + */ + public static class Parameterizer<O> extends AbstractParameterizer { + /** + * Parameter for the similarity matrix initialization + */ + public static final OptionID INITIALIZATION_ID = new OptionID("ap.initialization", "Similarity matrix initialization."); + + /** + * Parameter for the dampening factor. + */ + public static final OptionID LAMBDA_ID = new OptionID("ap.lambda", "Dampening factor lambda. Usually 0.5 to 1."); + + /** + * Parameter for the convergence factor. + */ + public static final OptionID CONVERGENCE_ID = new OptionID("ap.convergence", "Number of stable iterations for convergence."); + + /** + * Parameter for the maximum number of iterations. + */ + public static final OptionID MAXITER_ID = new OptionID("ap.maxiter", "Maximum number of iterations."); + + /** + * Initialization function for the similarity matrix. + */ + AffinityPropagationInitialization<O> initialization; + + /** + * Dampening parameter. + */ + double lambda = .5; + + /** + * Number of stable iterations for convergence. + */ + int convergence; + + /** + * Maximum number of iterations. 
+ */ + int maxiter; + + @Override + protected void makeOptions(Parameterization config) { + super.makeOptions(config); + final ObjectParameter<AffinityPropagationInitialization<O>> param = new ObjectParameter<>(INITIALIZATION_ID, AffinityPropagationInitialization.class, DistanceBasedInitializationWithMedian.class); + if(config.grab(param)) { + initialization = param.instantiateClass(config); + } + final DoubleParameter lambdaP = new DoubleParameter(LAMBDA_ID, .5); + lambdaP.addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE); + lambdaP.addConstraint(CommonConstraints.LESS_THAN_ONE_DOUBLE); + if(config.grab(lambdaP)) { + lambda = lambdaP.doubleValue(); + } + final IntParameter convergenceP = new IntParameter(CONVERGENCE_ID, 15); + convergenceP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT); + if(config.grab(convergenceP)) { + convergence = convergenceP.intValue(); + } + final IntParameter maxiterP = new IntParameter(MAXITER_ID, 1000); + if(config.grab(maxiterP)) { + maxiter = maxiterP.intValue(); + } + } + + @Override + protected AffinityPropagationClusteringAlgorithm<O> makeInstance() { + return new AffinityPropagationClusteringAlgorithm<>(initialization, lambda, convergence, maxiter); + } + } +} diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/affinitypropagation/AffinityPropagationInitialization.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/affinitypropagation/AffinityPropagationInitialization.java new file mode 100644 index 00000000..5dbc54de --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/affinitypropagation/AffinityPropagationInitialization.java @@ -0,0 +1,59 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.affinitypropagation; +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2013 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you 
can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +import de.lmu.ifi.dbs.elki.data.type.TypeInformation; +import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.Parameterizable; + +/** + * Initialization methods for affinity propagation. + * + * @author Erich Schubert + */ +public interface AffinityPropagationInitialization<O> extends Parameterizable { + /** + * Quantile to use for the diagonal entries. + */ + public static final OptionID QUANTILE_ID = new OptionID("ap.quantile", "Quantile to use for diagonal entries."); + + /** + * Compute the initial similarity matrix. + * + * @param db Database + * @param relation Data relation + * @param ids indexed DBIDs + * @return Similarity matrix + */ + double[][] getSimilarityMatrix(Database db, Relation<O> relation, ArrayDBIDs ids); + + /** + * Get the data type information for the similarity computations. 
+ * + * @return Data type + */ + TypeInformation getInputTypeRestriction(); +} diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/affinitypropagation/DistanceBasedInitializationWithMedian.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/affinitypropagation/DistanceBasedInitializationWithMedian.java new file mode 100644 index 00000000..2c8cabf9 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/affinitypropagation/DistanceBasedInitializationWithMedian.java @@ -0,0 +1,148 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.affinitypropagation; +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2013 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +import de.lmu.ifi.dbs.elki.data.type.TypeInformation; +import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs; +import de.lmu.ifi.dbs.elki.database.ids.DBIDArrayIter; +import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancefunction.minkowski.SquaredEuclideanDistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance; +import de.lmu.ifi.dbs.elki.utilities.datastructures.QuickSelect; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; + +/** + * Distance based initialization. + * + * @author Erich Schubert + * + * @param <O> Object type + * @param <D> Distance type + */ +public class DistanceBasedInitializationWithMedian<O, D extends NumberDistance<D, ?>> implements AffinityPropagationInitialization<O> { + /** + * Distance function. + */ + DistanceFunction<? super O, D> distance; + + /** + * Quantile to use. + */ + double quantile; + + /** + * Constructor. + * + * @param distance Similarity function + * @param quantile Quantile + */ + public DistanceBasedInitializationWithMedian(DistanceFunction<? 
super O, D> distance, double quantile) { + super(); + this.distance = distance; + this.quantile = quantile; + } + + @Override + public double[][] getSimilarityMatrix(Database db, Relation<O> relation, ArrayDBIDs ids) { + final int size = ids.size(); + DistanceQuery<O, D> dq = db.getDistanceQuery(relation, distance); + double[][] mat = new double[size][size]; + double[] flat = new double[(size * (size - 1)) >> 1]; + // TODO: optimize for double valued primitive distances. + DBIDArrayIter i1 = ids.iter(), i2 = ids.iter(); + for (int i = 0, j = 0; i < size; i++, i1.advance()) { + double[] mati = mat[i]; + i2.seek(i + 1); + for (int k = i + 1; k < size; k++, i2.advance()) { + mati[k] = -dq.distance(i1, i2).doubleValue(); + mat[k][i] = mati[k]; // symmetry. + flat[j] = mati[k]; + j++; + } + } + double median = QuickSelect.quantile(flat, quantile); + // On the diagonal, we place the median + for (int i = 0; i < size; i++) { + mat[i][i] = median; + } + return mat; + } + + @Override + public TypeInformation getInputTypeRestriction() { + return distance.getInputTypeRestriction(); + } + + /** + * Parameterization class. + * + * @author Erich Schubert + * + * @apiviz.exclude + * + * @param <O> Object type + * @param <D> Distance type + */ + public static class Parameterizer<O, D extends NumberDistance<D, ?>> extends AbstractParameterizer { + /** + * Parameter for the distance function. + */ + public static final OptionID DISTANCE_ID = new OptionID("ap.distance", "Distance function to use."); + + /** + * Distance function. + */ + DistanceFunction<? super O, D> distance; + + /** + * Quantile to use. + */ + double quantile; + + @Override + protected void makeOptions(Parameterization config) { + super.makeOptions(config); + ObjectParameter<DistanceFunction<? 
super O, D>> param = new ObjectParameter<>(DISTANCE_ID, DistanceFunction.class, SquaredEuclideanDistanceFunction.class); + if (config.grab(param)) { + distance = param.instantiateClass(config); + } + + DoubleParameter quantileP = new DoubleParameter(QUANTILE_ID, .5); + if (config.grab(quantileP)) { + quantile = quantileP.doubleValue(); + } + } + + @Override + protected DistanceBasedInitializationWithMedian<O, D> makeInstance() { + return new DistanceBasedInitializationWithMedian<>(distance, quantile); + } + } +} diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/affinitypropagation/SimilarityBasedInitializationWithMedian.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/affinitypropagation/SimilarityBasedInitializationWithMedian.java new file mode 100644 index 00000000..a138da96 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/affinitypropagation/SimilarityBasedInitializationWithMedian.java @@ -0,0 +1,153 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.affinitypropagation; +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2013 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +import de.lmu.ifi.dbs.elki.data.type.TypeInformation; +import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs; +import de.lmu.ifi.dbs.elki.database.ids.DBIDArrayIter; +import de.lmu.ifi.dbs.elki.database.query.similarity.SimilarityQuery; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance; +import de.lmu.ifi.dbs.elki.distance.similarityfunction.SimilarityFunction; +import de.lmu.ifi.dbs.elki.distance.similarityfunction.kernel.LinearKernelFunction; +import de.lmu.ifi.dbs.elki.utilities.datastructures.QuickSelect; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; + +/** + * Similarity based initialization. + * + * @author Erich Schubert + * + * @param <O> Object type + * @param <D> Distance type + */ +public class SimilarityBasedInitializationWithMedian<O, D extends NumberDistance<D, ?>> implements AffinityPropagationInitialization<O> { + /** + * Similarity function. + */ + SimilarityFunction<? super O, D> similarity; + + /** + * Quantile to use. + */ + double quantile; + + /** + * Constructor. + * + * @param similarity Similarity function + * @param quantile Quantile + */ + public SimilarityBasedInitializationWithMedian(SimilarityFunction<? 
super O, D> similarity, double quantile) { + super(); + this.similarity = similarity; + this.quantile = quantile; + } + + @Override + public double[][] getSimilarityMatrix(Database db, Relation<O> relation, ArrayDBIDs ids) { + final int size = ids.size(); + SimilarityQuery<O, D> sq = db.getSimilarityQuery(relation, similarity); + double[][] mat = new double[size][size]; + double[] flat = new double[(size * (size - 1)) >> 1]; + // TODO: optimize for double valued primitive distances. + DBIDArrayIter i1 = ids.iter(), i2 = ids.iter(); + // Compute self-similarities first, for centering: + for (int i = 0; i < size; i++, i1.advance()) { + mat[i][i] = sq.similarity(i1, i1).doubleValue() * .5; + } + i1.seek(0); + for (int i = 0, j = 0; i < size; i++, i1.advance()) { + final double[] mati = mat[i]; // Probably faster access. + i2.seek(i + 1); + for (int k = i + 1; k < size; k++, i2.advance()) { + mati[k] = sq.similarity(i1, i2).doubleValue() - mati[i] - mat[k][k]; + mat[k][i] = mati[k]; // symmetry. + flat[j] = mati[k]; + j++; + } + } + double median = QuickSelect.quantile(flat, quantile); + // On the diagonal, we place the median + for (int i = 0; i < size; i++) { + mat[i][i] = median; + } + return mat; + } + + @Override + public TypeInformation getInputTypeRestriction() { + return similarity.getInputTypeRestriction(); + } + + /** + * Parameterization class. + * + * @author Erich Schubert + * + * @apiviz.exclude + * + * @param <O> Object type + * @param <D> Distance type + */ + public static class Parameterizer<O, D extends NumberDistance<D, ?>> extends AbstractParameterizer { + /** + * Parameter for the similarity function. + */ + public static final OptionID SIMILARITY_ID = new OptionID("ap.similarity", "Similarity function to use."); + + /** + * Similarity function. + */ + SimilarityFunction<? super O, D> similarity; + + /** + * Quantile to use. 
+ */ + double quantile; + + @Override + protected void makeOptions(Parameterization config) { + super.makeOptions(config); + ObjectParameter<SimilarityFunction<? super O, D>> param = new ObjectParameter<>(SIMILARITY_ID, SimilarityFunction.class, LinearKernelFunction.class); + if (config.grab(param)) { + similarity = param.instantiateClass(config); + } + + DoubleParameter quantileP = new DoubleParameter(QUANTILE_ID, .5); + if (config.grab(quantileP)) { + quantile = quantileP.doubleValue(); + } + } + + @Override + protected SimilarityBasedInitializationWithMedian<O, D> makeInstance() { + return new SimilarityBasedInitializationWithMedian<>(similarity, quantile); + } + } +} diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/affinitypropagation/package-info.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/affinitypropagation/package-info.java new file mode 100644 index 00000000..bc6059ac --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/affinitypropagation/package-info.java @@ -0,0 +1,27 @@ +/** + * Affinity Propagation (AP) clustering. + */ + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2013 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ +package de.lmu.ifi.dbs.elki.algorithm.clustering.affinitypropagation;
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/biclustering/AbstractBiclustering.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/biclustering/AbstractBiclustering.java new file mode 100644 index 00000000..8b875340 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/biclustering/AbstractBiclustering.java @@ -0,0 +1,302 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.biclustering; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2013 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +import java.util.BitSet; + +import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm; +import de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm; +import de.lmu.ifi.dbs.elki.data.Cluster; +import de.lmu.ifi.dbs.elki.data.Clustering; +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.data.model.BiclusterModel; +import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs; +import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs; +import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDArrayIter; +import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.database.relation.RelationUtil; +import de.lmu.ifi.dbs.elki.utilities.BitsUtil; +import de.lmu.ifi.dbs.elki.utilities.exceptions.ExceptionMessages; + +/** + * Abstract class as a convenience for different biclustering approaches. + * <p/> + * The typically required values describing submatrices are computed using the + * corresponding values within a database of NumberVectors. + * <p/> + * The database is supposed to present a data matrix with a row representing an + * entry ({@link NumberVector}), a column representing a dimension (attribute) + * of the {@link NumberVector}s. + * + * @author Arthur Zimek + * @param <V> a certain subtype of NumberVector - the data matrix is supposed to + * consist of rows where each row relates to an object of type V and the + * columns relate to the attribute values of these objects + * @param <M> Cluster model type + */ +public abstract class AbstractBiclustering<V extends NumberVector<?>, M extends BiclusterModel> extends AbstractAlgorithm<Clustering<M>> implements ClusteringAlgorithm<Clustering<M>> { + /** + * Keeps the currently set database. + */ + private Database database; + + /** + * Relation we use. 
+ */ + protected Relation<V> relation; + + /** + * Iterator to use for more efficient random access. + */ + private DBIDArrayIter iter; + + /** + * The row ids corresponding to the currently set {@link #relation}. + */ + protected ArrayDBIDs rowIDs; + + /** + * Column dimensionality. + */ + private int colDim; + + /** + * Constructor. + */ + protected AbstractBiclustering() { + super(); + } + + /** + * Prepares the algorithm for running on a specific database. + * <p/> + * Assigns the database, the row ids, and the col ids, then calls + * {@link #biclustering()}. + * <p/> + * Any concrete algorithm should be implemented within method + * {@link #biclustering()} by an inheriting biclustering approach. + * + * @param relation Relation to process + * @return Clustering result + */ + public final Clustering<M> run(Relation<V> relation) { + this.relation = relation; + if (this.relation == null || this.relation.size() == 0) { + throw new IllegalArgumentException(ExceptionMessages.DATABASE_EMPTY); + } + colDim = RelationUtil.dimensionality(relation); + rowIDs = DBIDUtil.ensureArray(this.relation.getDBIDs()); + iter = rowIDs.iter(); + return biclustering(); + } + + /** + * Run the actual biclustering algorithm. + * <p/> + * This method is supposed to be called only from the method + * {@link #run}. + * <p/> + */ + protected abstract Clustering<M> biclustering(); + + /** + * Convert a bitset into integer column ids. + * + * @param cols + * @return integer column ids + */ + protected int[] colsBitsetToIDs(BitSet cols) { + int[] colIDs = new int[cols.cardinality()]; + int colsIndex = 0; + for (int i = cols.nextSetBit(0); i >= 0; i = cols.nextSetBit(i + 1)) { + colIDs[colsIndex] = i; + colsIndex++; + } + return colIDs; + } + + /** + * Convert a bitset into integer row ids. 
+ * + * @param rows + * @return integer row ids + */ + protected ArrayDBIDs rowsBitsetToIDs(BitSet rows) { + ArrayModifiableDBIDs rowIDs = DBIDUtil.newArray(rows.cardinality()); + DBIDArrayIter iter = this.rowIDs.iter(); + for (int i = rows.nextSetBit(0); i >= 0; i = rows.nextSetBit(i + 1)) { + iter.seek(i); + rowIDs.add(iter); + } + return rowIDs; + } + + /** + * Defines a Bicluster as given by the included rows and columns. + * + * @param rows the rows included in the Bicluster + * @param cols the columns included in the Bicluster + * @return a Bicluster as given by the included rows and columns + */ + protected Cluster<BiclusterModel> defineBicluster(BitSet rows, BitSet cols) { + ArrayDBIDs rowIDs = rowsBitsetToIDs(rows); + int[] colIDs = colsBitsetToIDs(cols); + return new Cluster<>(rowIDs, new BiclusterModel(colIDs)); + } + + /** + * Defines a Bicluster as given by the included rows and columns. + * + * @param rows the rows included in the Bicluster + * @param cols the columns included in the Bicluster + * @return A Bicluster as given by the included rows and columns + */ + protected Cluster<BiclusterModel> defineBicluster(long[] rows, long[] cols) { + ArrayDBIDs rowIDs = rowsBitsetToIDs(rows); + int[] colIDs = colsBitsetToIDs(cols); + return new Cluster<>(rowIDs, new BiclusterModel(colIDs)); + } + + /** + * Returns the value of the data matrix at row <code>row</code> and column + * <code>col</code>. 
+ * 
+ * @param row the row in the data matrix according to the current order of 
+ * rows (refers to database entry 
+ * <code>database.get(rowIDs[row])</code>) 
+ * @param col the column in the data matrix according to the current order of 
+ * columns (refers to the attribute value of a database entry 
+ * <code>getValue(colIDs[col])</code>) 
+ * @return the attribute value of the database entry as retrieved by 
+ * <code>database.get(rowIDs[row]).getValue(colIDs[col])</code> 
+ */ 
+ protected double valueAt(int row, int col) { 
+ iter.seek(row); 
+ return relation.get(iter).doubleValue(col); 
+ } 
+ 
+ /** 
+ * Get the DBID of a certain row 
+ * 
+ * @param row Row number 
+ * @return DBID of this row 
+ * @deprecated Expensive! 
+ */ 
+ @Deprecated 
+ protected DBID getRowDBID(int row) { 
+ return rowIDs.get(row); 
+ } 
+ 
+ /** 
+ * Convert a bitset into integer column ids. 
+ * 
+ * @param cols 
+ * @return integer column ids 
+ */ 
+ protected int[] colsBitsetToIDs(long[] cols) { 
+ int[] colIDs = new int[(int) BitsUtil.cardinality(cols)]; 
+ int colsIndex = 0; 
+ for (int cpos = 0, clpos = 0; clpos < cols.length; ++clpos) { 
+ long clong = cols[clpos]; 
+ if (clong == 0L) { 
+ cpos += Long.SIZE; 
+ continue; 
+ } 
+ for (int j = 0; j < Long.SIZE; ++j, ++cpos, clong >>>= 1) { 
+ if ((clong & 1L) == 1L) { 
+ colIDs[colsIndex] = cpos; 
+ ++colsIndex; 
+ } 
+ } 
+ } 
+ return colIDs; 
+ } 
+ 
+ /** 
+ * Convert a bitset into integer row ids. 
+ * 
+ * @param rows 
+ * @return integer row ids 
+ */ 
+ protected ArrayDBIDs rowsBitsetToIDs(long[] rows) { 
+ ArrayModifiableDBIDs rowIDs = DBIDUtil.newArray((int) BitsUtil.cardinality(rows)); 
+ DBIDArrayIter iter = this.rowIDs.iter(); 
+ outer: for (int rlpos = 0; rlpos < rows.length; ++rlpos) { 
+ long rlong = rows[rlpos]; 
+ // Fast skip blocks of 64 masked values. 
+ if (rlong == 0L) { + iter.advance(Long.SIZE); + continue; + } + for (int i = 0; i < Long.SIZE; ++i, rlong >>>= 1, iter.advance()) { + if (!iter.valid()) { + break outer; + } + if ((rlong & 1L) == 1L) { + rowIDs.add(iter); + } + } + } + return rowIDs; + } + + /** + * Provides the number of rows of the data matrix. + * + * @return the number of rows of the data matrix + */ + protected int getRowDim() { + return this.rowIDs.size(); + } + + /** + * Provides the number of columns of the data matrix. + * + * @return the number of columns of the data matrix + */ + protected int getColDim() { + return colDim; + } + + /** + * Getter for database. + * + * @return database + */ + public Database getDatabase() { + return database; + } + + /** + * Getter for the relation. + * + * @return relation + */ + public Relation<V> getRelation() { + return relation; + } +} diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/biclustering/ChengAndChurch.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/biclustering/ChengAndChurch.java new file mode 100644 index 00000000..e110faff --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/biclustering/ChengAndChurch.java @@ -0,0 +1,900 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.biclustering; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2013 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +import java.util.Arrays; + +import de.lmu.ifi.dbs.elki.data.Cluster; +import de.lmu.ifi.dbs.elki.data.Clustering; +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.data.model.BiclusterWithInversionsModel; +import de.lmu.ifi.dbs.elki.data.type.TypeInformation; +import de.lmu.ifi.dbs.elki.data.type.TypeUtil; +import de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs; +import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; +import de.lmu.ifi.dbs.elki.database.relation.RelationUtil; +import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress; +import de.lmu.ifi.dbs.elki.math.Mean; +import de.lmu.ifi.dbs.elki.math.statistics.distribution.Distribution; +import de.lmu.ifi.dbs.elki.math.statistics.distribution.UniformDistribution; +import de.lmu.ifi.dbs.elki.utilities.BitsUtil; +import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; + +/** + * Perform Cheng and Church biclustering. + * + * <p> + * Reference: <br> + * Y. Cheng and G. M. Church. Biclustering of expression data. 
In Proceedings of + * the 8th International Conference on Intelligent Systems for Molecular Biology + * (ISMB), San Diego, CA, 2000. + * </p> + * + * @author Erich Schubert + * @param <V> Vector type. + */ +@Reference(authors = "Y. Cheng, G. M. Church", title = "Biclustering of expression data", booktitle = "Proc. 8th International Conference on Intelligent Systems for Molecular Biology (ISMB)") +public class ChengAndChurch<V extends NumberVector<?>> extends AbstractBiclustering<V, BiclusterWithInversionsModel> { + /** + * The logger for this class. + */ + private static final Logging LOG = Logging.getLogger(ChengAndChurch.class); + + /** + * The minimum number of columns that the database must have so that a removal + * of columns is performed in {@link #multipleNodeDeletion}.</p> + * <p> + * Just start deleting multiple columns when more than 100 columns are in the + * data matrix. + * </p> + */ + private static final int MIN_COLUMN_REMOVE_THRESHOLD = 100; + + /** + * The minimum number of rows that the database must have so that a removal of + * rows is performed in {@link #multipleNodeDeletion}. + * <p> + * Just start deleting multiple rows when more than 100 rows are in the data + * matrix. + * </p> + * <!-- + * <p> + * The value is set to 100 as this is not really described in the paper. + * </p> + * --> + */ + private static final int MIN_ROW_REMOVE_THRESHOLD = 100; + + /** + * Threshold for the score. + */ + private double delta; + + /** + * The parameter for multiple node deletion.</p> + * <p> + * It is used to magnify the {@link #delta} value in the + * {@link #multipleNodeDeletion} method. + * </p> + */ + private double alpha; + + /** + * Number of biclusters to be found. + */ + private int n; + + /** + * Allow inversion of rows in the last phase. + */ + private boolean useinverted = true; + + /** + * Distribution to sample random replacement values from. + */ + private Distribution dist; + + /** + * Constructor. 
+ * + * @param delta Delta parameter: desired quality + * @param alpha Alpha parameter: controls switching to single node deletion + * approach + * @param n Number of clusters to detect + * @param dist Distribution of random values to insert + */ + public ChengAndChurch(double delta, double alpha, int n, Distribution dist) { + super(); + this.delta = delta; + this.alpha = alpha; + this.n = n; + this.dist = dist; + } + + /** + * Visitor pattern for processing cells. + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static interface CellVisitor { + /** Different modes of operation. */ + int ALL = 0, SELECTED = 1, NOT_SELECTED = 2; + + /** + * Visit a cell. + * + * @param val Value + * @param row Row Number + * @param col Column number + * @param selrow Boolean, whether row is selected + * @param selcol Boolean, whether column is selected + * @return Stop flag, return {@code true} to stop visiting + */ + public boolean visit(double val, int row, int col, boolean selrow, boolean selcol); + } + + /** + * Bicluster candidate. + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + protected static class BiclusterCandidate { + /** + * Cardinalities. + */ + int rowcard, colcard; + + /** + * Means. + */ + double[] rowM, colM; + + /** + * Row and column bitmasks. + */ + long[] rows, irow, cols; + + /** + * Mean of the current bicluster. + */ + double allM; + + /** + * The current bicluster score (mean squared residue). + */ + double residue; + + /** + * Constructor. + * + * @param rows Row dimensionality. + * @param cols Column dimensionality. + */ + protected BiclusterCandidate(int rows, int cols) { + super(); + this.rows = BitsUtil.ones(rows); + this.irow = BitsUtil.zero(rows); + this.rowcard = rows; + this.rowM = new double[rows]; + this.cols = BitsUtil.ones(cols); + this.colcard = cols; + this.colM = new double[cols]; + } + + /** + * Resets the values for the next cluster search. 
+ */ + protected void reset() { + rows = BitsUtil.ones(rowM.length); + rowcard = rowM.length; + cols = BitsUtil.ones(colM.length); + colcard = colM.length; + BitsUtil.zeroI(irow); + } + + /** + * Visit all selected cells in the data matrix. + * + * @param mat Data matrix + * @param mode Operation mode + * @param visitor Visitor function + */ + protected void visitAll(double[][] mat, int mode, CellVisitor visitor) { + // For efficiency, we manually iterate over the rows and column bitmasks. + // This saves repeated shifting needed by the manual bit access. + for(int rpos = 0, rlpos = 0; rlpos < rows.length; ++rlpos) { + long rlong = rows[rlpos]; + // Fast skip blocks of 64 masked values. + if((mode == CellVisitor.SELECTED && rlong == 0L) || (mode == CellVisitor.NOT_SELECTED && rlong == -1L)) { + rpos += Long.SIZE; + continue; + } + for(int i = 0; i < Long.SIZE && rpos < rowM.length; ++i, ++rpos, rlong >>>= 1) { + boolean rselected = ((rlong & 1L) == 1L); + if((mode == CellVisitor.SELECTED && !rselected) || (mode == CellVisitor.NOT_SELECTED && rselected)) { + continue; + } + for(int cpos = 0, clpos = 0; clpos < cols.length; ++clpos) { + long clong = cols[clpos]; + if((mode == CellVisitor.SELECTED && clong == 0L) || (mode == CellVisitor.NOT_SELECTED && clong == -1L)) { + cpos += Long.SIZE; + continue; + } + for(int j = 0; j < Long.SIZE && cpos < colM.length; ++j, ++cpos, clong >>>= 1) { + boolean cselected = ((clong & 1L) == 1L); + if((mode == CellVisitor.SELECTED && !cselected) || (mode == CellVisitor.NOT_SELECTED && cselected)) { + continue; + } + boolean stop = visitor.visit(mat[rpos][cpos], rpos, cpos, rselected, cselected); + if(stop) { + return; + } + } + } + } + } + } + + /** + * Visit a column of the matrix. 
+ * + * @param mat Data matrix + * @param col Column to visit + * @param mode Operation mode + * @param visitor Visitor function + */ + protected void visitColumn(double[][] mat, int col, int mode, CellVisitor visitor) { + boolean cselected = BitsUtil.get(cols, col); + // For efficiency, we manually iterate over the rows and column bitmasks. + // This saves repeated shifting needed by the manual bit access. + for(int rpos = 0, rlpos = 0; rlpos < rows.length; ++rlpos) { + long rlong = rows[rlpos]; + // Fast skip blocks of 64 masked values. + if(mode == CellVisitor.SELECTED && rlong == 0L) { + rpos += Long.SIZE; + continue; + } + if(mode == CellVisitor.NOT_SELECTED && rlong == -1L) { + rpos += Long.SIZE; + continue; + } + for(int i = 0; i < Long.SIZE && rpos < rowM.length; ++i, ++rpos, rlong >>>= 1) { + boolean rselected = ((rlong & 1L) == 1L); + if(mode == CellVisitor.SELECTED && !rselected) { + continue; + } + if(mode == CellVisitor.NOT_SELECTED && rselected) { + continue; + } + boolean stop = visitor.visit(mat[rpos][col], rpos, col, rselected, cselected); + if(stop) { + return; + } + } + } + } + + /** + * Visit a row of the data matrix. + * + * @param mat Data matrix + * @param row Row to visit + * @param visitor Visitor function + */ + protected void visitRow(double[][] mat, int row, int mode, CellVisitor visitor) { + boolean rselected = BitsUtil.get(rows, row); + final double[] rowdata = mat[row]; + for(int cpos = 0, clpos = 0; clpos < cols.length; ++clpos) { + long clong = cols[clpos]; + // Fast skip blocks of 64 masked values. 
+ if(mode == CellVisitor.SELECTED && clong == 0L) { + cpos += Long.SIZE; + continue; + } + if(mode == CellVisitor.NOT_SELECTED && clong == -1L) { + cpos += Long.SIZE; + continue; + } + for(int j = 0; j < Long.SIZE && cpos < colM.length; ++j, ++cpos, clong >>>= 1) { + boolean cselected = ((clong & 1L) == 1L); + if(mode == CellVisitor.SELECTED && !cselected) { + continue; + } + if(mode == CellVisitor.NOT_SELECTED && cselected) { + continue; + } + boolean stop = visitor.visit(rowdata[cpos], row, cpos, rselected, cselected); + if(stop) { + return; + } + } + } + } + + /** Visitor for updating the means. */ + private final CellVisitor MEANVISITOR = new CellVisitor() { + @Override + public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) { + if(selcol) { + rowM[row] += val; + } + if(selrow) { + colM[col] += val; + } + if(selcol && selrow) { + allM += val; + } + return false; + } + }; + + /** + * Update the row means and column means. + * + * @param mat Data matrix + * @param all Flag, to update all + * @return overall mean + */ + protected double updateRowAndColumnMeans(final double[][] mat, boolean all) { + final int mode = all ? CellVisitor.ALL : CellVisitor.SELECTED; + Arrays.fill(rowM, 0.); + Arrays.fill(colM, 0.); + allM = 0.; + visitAll(mat, mode, MEANVISITOR); + visitColumn(mat, 0, mode, new CellVisitor() { + @Override + public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) { + rowM[row] /= colcard; + return false; + } + }); + visitRow(mat, 0, mode, new CellVisitor() { + @Override + public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) { + colM[col] /= rowcard; + return false; + } + }); + allM /= colcard * rowcard; + return allM; + } + + /** + * Compute the mean square residue. 
+ * 
+ * @param mat Data matrix 
+ * @return mean squared residue 
+ */ 
+ protected double computeMeanSquaredDeviation(final double[][] mat) { 
+ final Mean msr = new Mean(); 
+ visitAll(mat, CellVisitor.SELECTED, new CellVisitor() { 
+ @Override 
+ public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) { 
+ assert (selrow && selcol); 
+ double v = val - rowM[row] - colM[col] + allM; 
+ msr.put(v * v); 
+ return false; 
+ } 
+ }); 
+ residue = msr.getMean(); 
+ return residue; 
+ } 
+ 
+ /** 
+ * Computes the <b>mean row residue</b> of the given <code>row</code>. 
+ * 
+ * @param mat Data matrix 
+ * @param row The row whose residue should be computed. 
+ * @param rowinverted Indicates if the row should be considered inverted. 
+ * @return The row residue of the given <code>row</code>. 
+ */ 
+ protected double computeRowResidue(final double[][] mat, int row, final boolean rowinverted) { 
+ final Mean rowResidue = new Mean(); 
+ visitRow(mat, row, CellVisitor.SELECTED, new CellVisitor() { 
+ @Override 
+ public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) { 
+ assert (selcol); 
+ final double rowMean = rowM[row]; 
+ final double colMean = colM[col]; 
+ double v = ((!rowinverted) ? (val - rowMean) : (rowMean - val)) - colMean + allM; 
+ rowResidue.put(v * v); 
+ return false; 
+ } 
+ }); 
+ return rowResidue.getMean(); 
+ } 
+ 
+ /** 
+ * 
+ * Computes the <b>mean column residue</b> of the given <code>col</code>. 
+ * 
+ * @param col The column whose residue should be computed. 
+ * @return The column residue of the given <code>col</code>umn. 
+ */ 
+ protected double computeColResidue(final double[][] mat, final int col) { 
+ final double bias = colM[col] - allM; 
+ final Mean colResidue = new Mean(); 
+ visitColumn(mat, col, CellVisitor.SELECTED, new CellVisitor() { 
+ @Override 
+ public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) { 
+ assert (selrow); 
+ final double rowMean = rowM[row]; 
+ double v = val - rowMean - bias; 
+ colResidue.put(v * v); 
+ return false; 
+ } 
+ }); 
+ return colResidue.getMean(); 
+ } 
+ 
+ /** 
+ * Updates the mask with replacement values for all data in the given rows 
+ * and columns. 
+ * 
+ * @param mat Data matrix to update; currently selected cells are overwritten. 
+ * @param replacement Distribution to sample replacement values from. 
+ */ 
+ protected void maskMatrix(final double[][] mat, final Distribution replacement) { 
+ visitAll(mat, CellVisitor.SELECTED, new CellVisitor() { 
+ @Override 
+ public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) { 
+ assert (selrow && selcol); 
+ mat[row][col] = replacement.nextRandom(); 
+ return false; 
+ } 
+ }); 
+ } 
+ 
+ /** 
+ * Select or deselect a column. 
+ * 
+ * @param cnum Column to select 
+ * @param set Value to set 
+ */ 
+ protected void selectColumn(int cnum, boolean set) { 
+ if(set) { 
+ BitsUtil.setI(cols, cnum); 
+ colcard++; 
+ } 
+ else { 
+ BitsUtil.clearI(cols, cnum); 
+ colcard--; 
+ } 
+ } 
+ 
+ /** 
+ * Select or deselect a row. 
+ * + * @param rnum Row to select + * @param set Value to set + */ + protected void selectRow(int rnum, boolean set) { + if(set) { + BitsUtil.setI(rows, rnum); + rowcard++; + } + else { + BitsUtil.clearI(rows, rnum); + rowcard--; + } + } + + protected void invertRow(int rnum, boolean b) { + BitsUtil.setI(irow, rnum); + } + } + + @Override + public Clustering<BiclusterWithInversionsModel> biclustering() { + double[][] mat = RelationUtil.relationAsMatrix(relation, rowIDs); + + BiclusterCandidate cand = new BiclusterCandidate(getRowDim(), getColDim()); + + Clustering<BiclusterWithInversionsModel> result = new Clustering<>("Cheng-and-Church", "Cheng and Church Biclustering"); + ModifiableDBIDs noise = DBIDUtil.newHashSet(relation.getDBIDs()); + + FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Extracting Cluster", n, LOG) : null; + for(int i = 0; i < n; i++) { + cand.reset(); + multipleNodeDeletion(mat, cand); + if(LOG.isVeryVerbose()) { + LOG.veryverbose("Residue after Alg 2: " + cand.residue + " " + cand.rowcard + "x" + cand.colcard); + } + singleNodeDeletion(mat, cand); + if(LOG.isVeryVerbose()) { + LOG.veryverbose("Residue after Alg 1: " + cand.residue + " " + cand.rowcard + "x" + cand.colcard); + } + nodeAddition(mat, cand); + if(LOG.isVeryVerbose()) { + LOG.veryverbose("Residue after Alg 3: " + cand.residue + " " + cand.rowcard + "x" + cand.colcard); + } + cand.maskMatrix(mat, dist); + BiclusterWithInversionsModel model = new BiclusterWithInversionsModel(colsBitsetToIDs(cand.cols), rowsBitsetToIDs(cand.irow)); + final ArrayDBIDs cids = rowsBitsetToIDs(cand.rows); + noise.removeDBIDs(cids); + result.addToplevelCluster(new Cluster<>(cids, model)); + + if(LOG.isVerbose()) { + LOG.verbose("Score of bicluster " + (i + 1) + ": " + cand.residue + "\n"); + LOG.verbose("Number of rows: " + cand.rowcard + "\n"); + LOG.verbose("Number of columns: " + cand.colcard + "\n"); + // LOG.verbose("Total number of masked values: " + maskedVals.size() + + // "\n"); + } + 
if(prog != null) { + prog.incrementProcessed(LOG); + } + } + // Add a noise cluster, full-dimensional. + if(!noise.isEmpty()) { + long[] allcols = BitsUtil.ones(getColDim()); + BiclusterWithInversionsModel model = new BiclusterWithInversionsModel(colsBitsetToIDs(allcols), DBIDUtil.EMPTYDBIDS); + result.addToplevelCluster(new Cluster<>(noise, true, model)); + } + if(prog != null) { + prog.ensureCompleted(LOG); + } + return result; + } + + /** + * Algorithm 1 of Cheng and Church: + * + * Remove single rows or columns. + * + * Inverted rows are not supported in this method. + * + * @param mat Data matrix + * @param cand Bicluster candidate + */ + private void singleNodeDeletion(final double[][] mat, final BiclusterCandidate cand) { + // Assume that cand.residue is up to date! + while(cand.residue > delta && (cand.colcard > 2 || cand.rowcard > 2)) { + // Store current maximum. Need final mutable, so use arrays. + final double[] max = { Double.NEGATIVE_INFINITY }; + final int[] best = { -1, -1 }; + + // Test rows + if(cand.rowcard > 2) { + cand.visitColumn(mat, 0, CellVisitor.SELECTED, new CellVisitor() { + @Override + public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) { + assert (selrow); + double rowResidue = cand.computeRowResidue(mat, row, false); + if(max[0] < rowResidue) { + max[0] = rowResidue; + best[0] = row; + } + return false; + } + }); + } + + // Test columns: + if(cand.colcard > 2) { + cand.visitRow(mat, 0, CellVisitor.SELECTED, new CellVisitor() { + @Override + public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) { + assert (selcol); + double colResidue = cand.computeColResidue(mat, col); + if(max[0] < colResidue) { + max[0] = colResidue; + best[1] = col; + } + return false; + } + }); + } + + if(best[1] >= 0) { // then override bestrow! 
+ cand.selectColumn(best[1], false); + } + else { + assert (best[0] >= 0); + cand.selectRow(best[0], false); + } + // TODO: incremental update could be much faster? + cand.updateRowAndColumnMeans(mat, false); + cand.computeMeanSquaredDeviation(mat); + if(LOG.isDebuggingFine()) { + LOG.debugFine("Residue in Alg 1: " + cand.residue + " " + cand.rowcard + "x" + cand.colcard); + } + } + } + + // + /** + * Algorithm 2 of Cheng and Church. + * + * Remove all rows and columns that reduce the residue by alpha. + * + * Inverted rows are not supported in this method. + * + * @param mat Data matrix + * @param cand Bicluster candidate + */ + private void multipleNodeDeletion(final double[][] mat, final BiclusterCandidate cand) { + cand.updateRowAndColumnMeans(mat, false); + cand.computeMeanSquaredDeviation(mat); + + // Note: assumes that cand.residue = H(I,J) + while(cand.residue > delta) { + final boolean[] modified = { false, false }; + + // Step 2: remove rows above threshold + if(cand.rowcard > MIN_ROW_REMOVE_THRESHOLD) { + final double alphaResidue = alpha * cand.residue; + cand.visitColumn(mat, 0, CellVisitor.SELECTED, new CellVisitor() { + @Override + public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) { + assert (selrow); + if(cand.computeRowResidue(mat, row, false) > alphaResidue) { + cand.selectRow(row, false); + modified[0] = true; + } + return (cand.rowcard > MIN_ROW_REMOVE_THRESHOLD); + } + }); + + // Step 3: update residue + if(modified[0]) { + cand.updateRowAndColumnMeans(mat, false); + cand.computeMeanSquaredDeviation(mat); + } + } + + // Step 4: remove columns above threshold + if(cand.colcard > MIN_COLUMN_REMOVE_THRESHOLD) { + final double alphaResidue = alpha * cand.residue; + cand.visitRow(mat, 0, CellVisitor.SELECTED, new CellVisitor() { + @Override + public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) { + assert (selcol); + if(cand.computeColResidue(mat, col) > alphaResidue) { + 
cand.selectColumn(col, false); + modified[1] = true; + } + return (cand.colcard > MIN_COLUMN_REMOVE_THRESHOLD); + } + }); + if(modified[1]) { + cand.updateRowAndColumnMeans(mat, false); + cand.computeMeanSquaredDeviation(mat); + } + } + + if(LOG.isDebuggingFine()) { + LOG.debugFine("Residue in Alg 2: " + cand.residue + " " + cand.rowcard + "x" + cand.colcard); + } + // Step 5: if nothing has been removed, try removing single nodes. + if(!modified[0] && !modified[1]) { + break; + // Will be executed next in main loop, as per algorithm 4. + // singleNodeDeletion(); + } + } + } + + /** + * Algorithm 3 of Cheng and Church. + * + * Try to re-add rows or columns that decrease the overall score. + * + * Also try adding inverted rows. + * + * @param mat Data matrix + * @param cand Bicluster candidate + */ + private void nodeAddition(final double[][] mat, final BiclusterCandidate cand) { + cand.updateRowAndColumnMeans(mat, true); + cand.computeMeanSquaredDeviation(mat); + while(true) { + // We need this to be final + mutable + final boolean[] added = new boolean[] { false, false }; + + // Step 2: add columns + cand.visitRow(mat, 0, CellVisitor.NOT_SELECTED, new CellVisitor() { + @Override + public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) { + assert (!selcol); + if(cand.computeColResidue(mat, col) <= cand.residue) { + cand.selectColumn(col, true); + added[0] = true; + } + return false; + } + }); + + // Step 3: recompute values + if(added[0]) { + cand.updateRowAndColumnMeans(mat, true); + cand.computeMeanSquaredDeviation(mat); + } + + // Step 4: try adding rows. + cand.visitColumn(mat, 0, CellVisitor.NOT_SELECTED, new CellVisitor() { + @Override + public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) { + assert (!selrow); + if(cand.computeRowResidue(mat, row, false) <= cand.residue) { + cand.selectRow(row, true); + added[1] = true; + } + return false; + } + }); + + // Step 5: try adding inverted rows. 
+ if(useinverted) { + cand.visitColumn(mat, 0, CellVisitor.NOT_SELECTED, new CellVisitor() { + @Override + public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) { + assert (!selrow); + if(cand.computeRowResidue(mat, row, true) <= cand.residue) { + cand.selectRow(row, true); + cand.invertRow(row, true); + added[1] = true; + } + return false; + } + }); + } + if(added[1]) { + cand.updateRowAndColumnMeans(mat, true); + cand.computeMeanSquaredDeviation(mat); + if(LOG.isDebuggingFine()) { + LOG.debugFine("Residue in Alg 3: " + cand.residue + " " + cand.rowcard + "x" + cand.colcard); + } + } + if(!added[0] && !added[1]) { + break; + } + } + } + + @Override + public TypeInformation[] getInputTypeRestriction() { + return TypeUtil.array(TypeUtil.NUMBER_VECTOR_FIELD); + } + + @Override + protected Logging getLogger() { + return LOG; + } + + /** + * Parameterization class. + * + * @author Erich Schubert + * + * @apiviz.exclude + * + * @param <V> Vector type + */ + public static class Parameterizer<V extends NumberVector<?>> extends AbstractParameterizer { + /** + * Parameter to specify the distribution of replacement values when masking + * a cluster. + */ + public static final OptionID DIST_ID = new OptionID("chengandchurch.replacement", "Distribution of replacement values when masking found clusters."); + + /** + * Threshold value to determine the maximal acceptable score (mean squared + * residue) of a bicluster. + * <p/> + * Key: {@code -chengandchurch.delta} + * </p> + */ + public static final OptionID DELTA_ID = new OptionID("chengandchurch.delta", "Threshold value to determine the maximal acceptable score (mean squared residue) of a bicluster."); + + /** + * Parameter for multiple node deletion to accelerate the algorithm. 
(>= + * 1) + * <p/> + * Key: {@code -chengandchurch.alpha} + * </p> + */ + public static final OptionID ALPHA_ID = new OptionID("chengandchurch.alpha", "Parameter for multiple node deletion to accelerate the algorithm."); + + /** + * Number of biclusters to be found. + * <p/> + * Default value: 1 + * </p> + * <p/> + * Key: {@code -chengandchurch.n} + * </p> + */ + public static final OptionID N_ID = new OptionID("chengandchurch.n", "The number of biclusters to be found."); + + /** + * Threshold for the score ({@link #DELTA_ID}). + */ + private double delta; + + /** + * The parameter for multiple node deletion.</p> + * <p> + * It is used to magnify the {@link #delta} value in the + * {@link ChengAndChurch#multipleNodeDeletion} method. + * </p> + */ + private double alpha; + + /** + * Number of biclusters to be found. + */ + private int n; + + /** + * Distribution of replacement values. + */ + private Distribution dist; + + @Override + protected void makeOptions(Parameterization config) { + super.makeOptions(config); + DoubleParameter deltaP = new DoubleParameter(DELTA_ID); + if(config.grab(deltaP)) { + delta = deltaP.doubleValue(); + } + deltaP.addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_DOUBLE); + + IntParameter nP = new IntParameter(N_ID, 1); + nP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT); + if(config.grab(nP)) { + n = nP.intValue(); + } + + DoubleParameter alphaP = new DoubleParameter(ALPHA_ID, 1.); + alphaP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_DOUBLE); + if(config.grab(alphaP)) { + alpha = alphaP.doubleValue(); + } + + ObjectParameter<Distribution> distP = new ObjectParameter<>(DIST_ID, Distribution.class, UniformDistribution.class); + if(config.grab(distP)) { + dist = distP.instantiateClass(config); + } + } + + @Override + protected ChengAndChurch<V> makeInstance() { + return new ChengAndChurch<>(delta, alpha, n, dist); + } + } +} diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/biclustering/package-info.java 
b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/biclustering/package-info.java new file mode 100644 index 00000000..21363bfc --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/biclustering/package-info.java @@ -0,0 +1,28 @@ +/** + * <p>Biclustering algorithms.</p> + * + * + */ +/* +This file is part of ELKI: +Environment for Developing KDD-Applications Supported by Index-Structures + +Copyright (C) 2013 +Ludwig-Maximilians-Universität München +Lehr- und Forschungseinheit für Datenbanksysteme +ELKI Development Team + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU Affero General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU Affero General Public License for more details. + +You should have received a copy of the GNU Affero General Public License +along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ +package de.lmu.ifi.dbs.elki.algorithm.clustering.biclustering;
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/CASH.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/CASH.java index 0d82add9..8e5fa627 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/CASH.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/CASH.java @@ -74,7 +74,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import de.lmu.ifi.dbs.elki.utilities.documentation.Title; import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ListParameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter; @@ -838,22 +838,22 @@ public class CASH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin protected void makeOptions(Parameterization config) { super.makeOptions(config); IntParameter minptsP = new IntParameter(MINPTS_ID); - minptsP.addConstraint(new GreaterConstraint(0)); + minptsP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT); if (config.grab(minptsP)) { minpts = minptsP.getValue(); } IntParameter maxlevelP = new IntParameter(MAXLEVEL_ID); - maxlevelP.addConstraint(new GreaterConstraint(0)); + maxlevelP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT); if (config.grab(maxlevelP)) { maxlevel = maxlevelP.getValue(); } IntParameter mindimP = new IntParameter(MINDIM_ID, 1); - mindimP.addConstraint(new GreaterConstraint(0)); + mindimP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT); if (config.grab(mindimP)) { mindim = mindimP.getValue(); } DoubleParameter jitterP = new DoubleParameter(JITTER_ID); 
- jitterP.addConstraint(new GreaterConstraint(0)); + jitterP.addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE); if (config.grab(jitterP)) { jitter = jitterP.getValue(); } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/COPAC.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/COPAC.java index 9a4b8512..68878aef 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/COPAC.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/COPAC.java @@ -29,7 +29,7 @@ import java.util.Map; import java.util.Map.Entry; import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm; -import de.lmu.ifi.dbs.elki.algorithm.AbstractDistanceBasedAlgorithm; +import de.lmu.ifi.dbs.elki.algorithm.DistanceBasedAlgorithm; import de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm; import de.lmu.ifi.dbs.elki.data.Cluster; import de.lmu.ifi.dbs.elki.data.Clustering; @@ -270,7 +270,7 @@ public class COPAC<V extends NumberVector<?>, D extends Distance<D>> extends Abs public ClusteringAlgorithm<Clustering<Model>> getPartitionAlgorithm(DistanceQuery<V, D> query) { ListParameterization reconfig = new ListParameterization(partitionAlgorithmParameters); ProxyDistanceFunction<V, D> dist = ProxyDistanceFunction.proxy(query); - reconfig.addParameter(AbstractDistanceBasedAlgorithm.DISTANCE_FUNCTION_ID, dist); + reconfig.addParameter(DistanceBasedAlgorithm.DISTANCE_FUNCTION_ID, dist); ClusteringAlgorithm<Clustering<Model>> instance = reconfig.tryInstantiate(partitionAlgorithm); reconfig.failOnErrors(); return instance; @@ -335,7 +335,7 @@ public class COPAC<V extends NumberVector<?>, D extends Distance<D>> extends Abs ClassParameter<ClusteringAlgorithm<Clustering<Model>>> algP = new ClassParameter<>(PARTITION_ALGORITHM_ID, ClusteringAlgorithm.class); if(config.grab(algP)) { ListParameterization predefined = new ListParameterization(); - predefined.addParameter(AbstractDistanceBasedAlgorithm.DISTANCE_FUNCTION_ID, pdistI); + 
predefined.addParameter(DistanceBasedAlgorithm.DISTANCE_FUNCTION_ID, pdistI); TrackParameters trackpar = new TrackParameters(config); ChainedParameterization chain = new ChainedParameterization(predefined, trackpar); chain.errorsTo(config); diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/HiCO.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/HiCO.java index d1b714bf..79ddc16e 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/HiCO.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/HiCO.java @@ -36,9 +36,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.LessConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ChainedParameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ListParameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
@@ -162,33 +160,34 @@ public class HiCO<V extends NumberVector<?>> extends OPTICS<V, PCACorrelationDis super.makeOptions(config);
IntParameter muP = new IntParameter(MU_ID);
- muP.addConstraint(new GreaterConstraint(0));
- if (config.grab(muP)) {
+ muP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
+ if(config.grab(muP)) {
mu = muP.getValue();
}
IntParameter kP = new IntParameter(K_ID);
- kP.addConstraint(new GreaterConstraint(0));
+ kP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
kP.setOptional(true);
final int k;
- if (config.grab(kP)) {
+ if(config.grab(kP)) {
k = kP.getValue();
- } else {
+ }
+ else {
k = mu;
}
DoubleParameter deltaP = new DoubleParameter(DELTA_ID, DEFAULT_DELTA);
- deltaP.addConstraint(new GreaterEqualConstraint(0));
+ deltaP.addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_DOUBLE);
double delta = DEFAULT_DELTA;
- if (config.grab(deltaP)) {
+ if(config.grab(deltaP)) {
delta = deltaP.doubleValue();
}
DoubleParameter alphaP = new DoubleParameter(ALPHA_ID, DEFAULT_ALPHA);
- alphaP.addConstraint(new GreaterConstraint(0.0)); - alphaP.addConstraint(new LessConstraint(1.0));
+ alphaP.addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE);
+ alphaP.addConstraint(CommonConstraints.LESS_THAN_ONE_DOUBLE);
double alpha = DEFAULT_ALPHA;
- if (config.grab(alphaP)) {
+ if(config.grab(alphaP)) {
alpha = alphaP.doubleValue();
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/LMCLUS.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/LMCLUS.java index f9531be0..99144b42 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/LMCLUS.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/LMCLUS.java @@ -55,7 +55,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException; import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; @@ -116,7 +116,7 @@ public class LMCLUS extends AbstractAlgorithm<Clustering<Model>> { * Number of sampling rounds to find a good split */ private final int samplingLevel; - + /** * Random factory */ @@ -163,34 +163,34 @@ public class LMCLUS extends AbstractAlgorithm<Clustering<Model>> { FiniteProgress progress = LOG.isVerbose() ? new FiniteProgress("Clustered objects", relation.size(), LOG) : null; IndefiniteProgress cprogress = LOG.isVerbose() ? 
new IndefiniteProgress("Clusters found", LOG) : null; ModifiableDBIDs unclustered = DBIDUtil.newHashSet(relation.getDBIDs()); - Random r = rnd.getRandom(); + Random r = rnd.getSingleThreadedRandom(); final int maxdim = Math.min(maxLMDim, RelationUtil.dimensionality(relation)); int cnum = 0; - while (unclustered.size() > minsize) { + while(unclustered.size() > minsize) { DBIDs current = unclustered; int lmDim = 1; - for (int k = 1; k <= maxdim; k++) { + for(int k = 1; k <= maxdim; k++) { // Implementation note: this while loop is from the original publication // and the published LMCLUS source code. It doesn't make sense to me - // it is lacking a stop criterion other than "cluster is too small" and // "cluster is inseparable"! Additionally, there is good criterion for // stopping at the appropriate dimensionality either. - while (true) { + while(true) { Separation separation = findSeparation(relation, current, k, r); // logger.verbose("k: " + k + " goodness: " + separation.goodness + // " threshold: " + separation.threshold); - if (separation.goodness <= sensitivityThreshold) { + if(separation.goodness <= sensitivityThreshold) { break; } ModifiableDBIDs subset = DBIDUtil.newArray(current.size()); - for (DBIDIter iter = current.iter(); iter.valid(); iter.advance()) { - if (deviation(relation.get(iter).getColumnVector().minusEquals(separation.originV), separation.basis) < separation.threshold) { + for(DBIDIter iter = current.iter(); iter.valid(); iter.advance()) { + if(deviation(relation.get(iter).getColumnVector().minusEquals(separation.originV), separation.basis) < separation.threshold) { subset.add(iter); } } // logger.verbose("size:"+subset.size()); - if (subset.size() < minsize) { + if(subset.size() < minsize) { break; } current = subset; @@ -199,7 +199,7 @@ public class LMCLUS extends AbstractAlgorithm<Clustering<Model>> { } } // No more clusters found - if (current.size() < minsize || current == unclustered) { + if(current.size() < minsize || current == 
unclustered) { break; } // New cluster found @@ -210,22 +210,22 @@ public class LMCLUS extends AbstractAlgorithm<Clustering<Model>> { ret.addToplevelCluster(cluster); // Remove from main working set. unclustered.removeDBIDs(current); - if (progress != null) { + if(progress != null) { progress.setProcessed(relation.size() - unclustered.size(), LOG); } - if (cprogress != null) { + if(cprogress != null) { cprogress.setProcessed(cnum, LOG); } } // Remaining objects are noise - if (unclustered.size() > 0) { + if(unclustered.size() > 0) { ret.addToplevelCluster(new Cluster<>(unclustered, true)); } - if (progress != null) { + if(progress != null) { progress.setProcessed(relation.size(), LOG); progress.ensureCompleted(LOG); } - if (cprogress != null) { + if(cprogress != null) { cprogress.setCompleted(LOG); } return ret; @@ -272,7 +272,7 @@ public class LMCLUS extends AbstractAlgorithm<Clustering<Model>> { int samples = (int) Math.min(Math.log(NOT_FROM_ONE_CLUSTER_PROBABILITY) / (Math.log(1 - Math.pow((1.0d / samplingLevel), dimension))), (double) currentids.size()); // System.out.println("Number of samples: " + samples); int remaining_retries = 100; - for (int i = 1; i <= samples; i++) { + for(int i = 1; i <= samples; i++) { DBIDs sample = DBIDUtil.randomSample(currentids, dimension + 1, r.nextLong()); final DBIDIter iter = sample.iter(); // Use first as origin @@ -282,17 +282,17 @@ public class LMCLUS extends AbstractAlgorithm<Clustering<Model>> { Matrix basis; { List<Vector> vectors = new ArrayList<>(sample.size() - 1); - for (; iter.valid(); iter.advance()) { + for(; iter.valid(); iter.advance()) { Vector vec = relation.get(iter).getColumnVector(); vectors.add(vec.minusEquals(originV)); } // generate orthogonal basis basis = generateOrthonormalBasis(vectors); - if (basis == null) { + if(basis == null) { // new sample has to be taken. 
i--; remaining_retries--; - if (remaining_retries < 0) { + if(remaining_retries < 0) { throw new AbortException("Too many retries in sampling, and always a linear dependant data set."); } continue; @@ -301,9 +301,9 @@ public class LMCLUS extends AbstractAlgorithm<Clustering<Model>> { // Generate and fill a histogram. DoubleDynamicHistogram histogram = new DoubleDynamicHistogram(BINS); double w = 1.0 / currentids.size(); - for (DBIDIter iter2 = currentids.iter(); iter2.valid(); iter2.advance()) { + for(DBIDIter iter2 = currentids.iter(); iter2.valid(); iter2.advance()) { // Skip sampled points - if (sample.contains(iter2)) { + if(sample.contains(iter2)) { continue; } Vector vec = relation.get(iter2).getColumnVector().minusEquals(originV); @@ -311,7 +311,7 @@ public class LMCLUS extends AbstractAlgorithm<Clustering<Model>> { histogram.increment(distance, w); } double[] th = findAndEvaluateThreshold(histogram); // evaluate threshold - if (th[1] > separation.goodness) { + if(th[1] > separation.goodness) { separation.goodness = th[1]; separation.threshold = th[0]; separation.originV = originV; @@ -341,16 +341,16 @@ public class LMCLUS extends AbstractAlgorithm<Clustering<Model>> { first = first.times(1.0 / first.euclideanLength()); Matrix ret = new Matrix(first.getDimensionality(), vectors.size()); ret.setCol(0, first); - for (int i = 1; i < vectors.size(); i++) { + for(int i = 1; i < vectors.size(); i++) { // System.out.println("Matrix:" + ret); Vector v_i = vectors.get(i); Vector u_i = v_i.copy(); // System.out.println("Vector " + i + ":" + partialSol); - for (int j = 0; j < i; j++) { + for(int j = 0; j < i; j++) { Vector v_j = ret.getCol(j); double f = v_i.transposeTimes(v_j) / v_j.transposeTimes(v_j); - if (Double.isNaN(f)) { - if (LOG.isDebuggingFine()) { + if(Double.isNaN(f)) { + if(LOG.isDebuggingFine()) { LOG.debugFine("Zero vector encountered? 
" + v_j); } return null; @@ -359,8 +359,8 @@ public class LMCLUS extends AbstractAlgorithm<Clustering<Model>> { } // check if the vectors weren't independent final double len_u_i = u_i.euclideanLength(); - if (len_u_i == 0.0) { - if (LOG.isDebuggingFine()) { + if(len_u_i == 0.0) { + if(LOG.isDebuggingFine()) { LOG.debugFine("Points not independent - no orthonormalization."); } return null; @@ -391,7 +391,7 @@ public class LMCLUS extends AbstractAlgorithm<Clustering<Model>> { { MeanVariance mv = new MeanVariance(); DoubleHistogram.Iter forward = histogram.iter(); - for (int i = 0; forward.valid(); i++, forward.advance()) { + for(int i = 0; forward.valid(); i++, forward.advance()) { p1[i] = forward.getValue() + ((i > 0) ? p1[i - 1] : 0); mv.put(i, forward.getValue()); mu1[i] = mv.getMean(); @@ -404,7 +404,7 @@ public class LMCLUS extends AbstractAlgorithm<Clustering<Model>> { DoubleHistogram.Iter backwards = histogram.iter(); backwards.seek(histogram.getNumBins() - 1); // Seek to last - for (int j = n - 1; backwards.valid(); j--, backwards.retract()) { + for(int j = n - 1; backwards.valid(); j--, backwards.retract()) { p2[j] = backwards.getValue() + ((j + 1 < n) ? 
p2[j + 1] : 0); mv.put(j, backwards.getValue()); mu2[j] = mv.getMean(); @@ -412,7 +412,7 @@ public class LMCLUS extends AbstractAlgorithm<Clustering<Model>> { } } - for (int i = 0; i < n; i++) { + for(int i = 0; i < n; i++) { jt[i] = 1.0 + 2 * (p1[i] * (Math.log(sigma1[i]) - Math.log(p1[i])) + p2[i] * (Math.log(sigma2[i]) - Math.log(p2[i]))); } @@ -420,23 +420,23 @@ public class LMCLUS extends AbstractAlgorithm<Clustering<Model>> { double bestgoodness = Double.NEGATIVE_INFINITY; double devPrev = jt[1] - jt[0]; - for (int i = 1; i < jt.length - 1; i++) { + for(int i = 1; i < jt.length - 1; i++) { double devCur = jt[i + 1] - jt[i]; // System.out.println(p1[i]); // System.out.println(jt[i + 1]); // System.out.println(jt[i]); // System.out.println(devCur); // Local minimum found - calculate depth - if (devCur >= 0 && devPrev <= 0) { + if(devCur >= 0 && devPrev <= 0) { double lowestMaxima = Double.POSITIVE_INFINITY; - for (int j = i - 1; j > 0; j--) { - if (jt[j - 1] < jt[j]) { + for(int j = i - 1; j > 0; j--) { + if(jt[j - 1] < jt[j]) { lowestMaxima = Math.min(lowestMaxima, jt[j]); break; } } - for (int j = i + 1; j < n - 2; j++) { - if (jt[j + 1] < jt[j]) { + for(int j = i + 1; j < n - 2; j++) { + if(jt[j + 1] < jt[j]) { lowestMaxima = Math.min(lowestMaxima, jt[j]); break; } @@ -445,11 +445,11 @@ public class LMCLUS extends AbstractAlgorithm<Clustering<Model>> { final double mud = mu1[i] - mu2[i]; double discriminability = mud * mud / (sigma1[i] * sigma1[i] + sigma2[i] * sigma2[i]); - if (Double.isNaN(discriminability)) { + if(Double.isNaN(discriminability)) { discriminability = -1; } double goodness = localDepth * discriminability; - if (goodness > bestgoodness) { + if(goodness > bestgoodness) { bestgoodness = goodness; bestpos = i; } @@ -552,7 +552,7 @@ public class LMCLUS extends AbstractAlgorithm<Clustering<Model>> { * Threshold */ private double threshold; - + /** * Random generator */ @@ -562,26 +562,26 @@ public class LMCLUS extends 
AbstractAlgorithm<Clustering<Model>> { protected void makeOptions(Parameterization config) { super.makeOptions(config); IntParameter maxLMDimP = new IntParameter(MAXDIM_ID); - maxLMDimP.addConstraint(new GreaterEqualConstraint(1)); + maxLMDimP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT); maxLMDimP.setOptional(true); - if (config.grab(maxLMDimP)) { + if(config.grab(maxLMDimP)) { maxdim = maxLMDimP.getValue(); } IntParameter minsizeP = new IntParameter(MINSIZE_ID); - minsizeP.addConstraint(new GreaterEqualConstraint(1)); - if (config.grab(minsizeP)) { + minsizeP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT); + if(config.grab(minsizeP)) { minsize = minsizeP.getValue(); } IntParameter samplingLevelP = new IntParameter(SAMPLINGL_ID, 100); - if (config.grab(samplingLevelP)) { + if(config.grab(samplingLevelP)) { samplingLevel = samplingLevelP.getValue(); } DoubleParameter sensivityThresholdP = new DoubleParameter(THRESHOLD_ID); - if (config.grab(sensivityThresholdP)) { + if(config.grab(sensivityThresholdP)) { threshold = sensivityThresholdP.getValue(); } RandomParameter rndP = new RandomParameter(RANDOM_ID); - if (config.grab(rndP)) { + if(config.grab(rndP)) { rnd = rndP.getValue(); } } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/ORCLUS.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/ORCLUS.java index a9c67a58..7733ddaa 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/ORCLUS.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/ORCLUS.java @@ -61,8 +61,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Description; import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import de.lmu.ifi.dbs.elki.utilities.documentation.Title; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.LessEqualConstraint; +import 
de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.RandomParameter; @@ -135,7 +134,7 @@ public class ORCLUS<V extends NumberVector<?>> extends AbstractProjectedClusteri // current dimensionality associated with each seed int dim_c = RelationUtil.dimensionality(relation); - if (dim_c < l) { + if(dim_c < l) { throw new IllegalStateException("Dimensionality of data < parameter l! " + "(" + dim_c + " < " + l + ")"); } @@ -149,8 +148,8 @@ public class ORCLUS<V extends NumberVector<?>> extends AbstractProjectedClusteri IndefiniteProgress cprogress = LOG.isVerbose() ? new IndefiniteProgress("Current number of clusters:", LOG) : null; - while (k_c > k) { - if (cprogress != null) { + while(k_c > k) { + if(cprogress != null) { cprogress.setProcessed(clusters.size(), LOG); } @@ -158,8 +157,8 @@ public class ORCLUS<V extends NumberVector<?>> extends AbstractProjectedClusteri assign(relation, distFunc, clusters); // determine current subspace associated with each cluster - for (ORCLUSCluster cluster : clusters) { - if (cluster.objectIDs.size() > 0) { + for(ORCLUSCluster cluster : clusters) { + if(cluster.objectIDs.size() > 0) { cluster.basis = findBasis(relation, distFunc, cluster, dim_c); } } @@ -172,18 +171,19 @@ public class ORCLUS<V extends NumberVector<?>> extends AbstractProjectedClusteri } assign(relation, distFunc, clusters); - if (cprogress != null) { + if(cprogress != null) { cprogress.setProcessed(clusters.size()); cprogress.setCompleted(LOG); } // get the result Clustering<Model> r = new Clustering<>("ORCLUS clustering", "orclus-clustering"); - for (ORCLUSCluster c : clusters) { + for(ORCLUSCluster c : clusters) { r.addToplevelCluster(new Cluster<Model>(c.objectIDs, ClusterModel.CLUSTER)); } return r; - } catch 
(Exception e) { + } + catch(Exception e) { throw new IllegalStateException(e); } } @@ -199,7 +199,7 @@ public class ORCLUS<V extends NumberVector<?>> extends AbstractProjectedClusteri DBIDs randomSample = DBIDUtil.randomSample(database.getDBIDs(), k, rnd); NumberVector.Factory<V, ?> factory = RelationUtil.getNumberVectorFactory(database); List<ORCLUSCluster> seeds = new ArrayList<>(); - for (DBIDIter iter = randomSample.iter(); iter.valid(); iter.advance()) { + for(DBIDIter iter = randomSample.iter(); iter.valid(); iter.advance()) { seeds.add(new ORCLUSCluster(database.get(iter), iter, factory)); } return seeds; @@ -217,29 +217,29 @@ public class ORCLUS<V extends NumberVector<?>> extends AbstractProjectedClusteri private void assign(Relation<V> database, DistanceQuery<V, DoubleDistance> distFunc, List<ORCLUSCluster> clusters) { NumberVector.Factory<V, ?> factory = RelationUtil.getNumberVectorFactory(database); // clear the current clusters - for (ORCLUSCluster cluster : clusters) { + for(ORCLUSCluster cluster : clusters) { cluster.objectIDs.clear(); } // projected centroids of the clusters List<V> projectedCentroids = new ArrayList<>(clusters.size()); - for (ORCLUSCluster c : clusters) { + for(ORCLUSCluster c : clusters) { projectedCentroids.add(projection(c, c.centroid, factory)); } // for each data point o do - for (DBIDIter it = database.iterDBIDs(); it.valid(); it.advance()) { + for(DBIDIter it = database.iterDBIDs(); it.valid(); it.advance()) { V o = database.get(it); DoubleDistance minDist = null; ORCLUSCluster minCluster = null; // determine projected distance between o and cluster - for (int i = 0; i < clusters.size(); i++) { + for(int i = 0; i < clusters.size(); i++) { ORCLUSCluster c = clusters.get(i); V o_proj = projection(c, o, factory); DoubleDistance dist = distFunc.distance(o_proj, projectedCentroids.get(i)); - if (minDist == null || minDist.compareTo(dist) > 0) { + if(minDist == null || minDist.compareTo(dist) > 0) { minDist = dist; minCluster = c; 
} @@ -250,8 +250,8 @@ public class ORCLUS<V extends NumberVector<?>> extends AbstractProjectedClusteri } // recompute the seed in each clusters - for (ORCLUSCluster cluster : clusters) { - if (cluster.objectIDs.size() > 0) { + for(ORCLUSCluster cluster : clusters) { + if(cluster.objectIDs.size() > 0) { cluster.centroid = Centroid.make(database, cluster.objectIDs).toVector(database); } } @@ -271,7 +271,7 @@ public class ORCLUS<V extends NumberVector<?>> extends AbstractProjectedClusteri // covariance matrix of cluster // Matrix covariance = Util.covarianceMatrix(database, cluster.objectIDs); GenericDistanceDBIDList<DoubleDistance> results = new GenericDistanceDBIDList<>(cluster.objectIDs.size()); - for (DBIDIter it = cluster.objectIDs.iter(); it.valid(); it.advance()) { + for(DBIDIter it = cluster.objectIDs.iter(); it.valid(); it.advance()) { DoubleDistance distance = distFunc.distance(cluster.centroid, database.get(it)); results.add(distance, it); } @@ -304,9 +304,9 @@ public class ORCLUS<V extends NumberVector<?>> extends AbstractProjectedClusteri */ private void merge(Relation<V> database, DistanceQuery<V, DoubleDistance> distFunc, List<ORCLUSCluster> clusters, int k_new, int d_new, IndefiniteProgress cprogress) { ArrayList<ProjectedEnergy> projectedEnergies = new ArrayList<>(); - for (int i = 0; i < clusters.size(); i++) { - for (int j = 0; j < clusters.size(); j++) { - if (i >= j) { + for(int i = 0; i < clusters.size(); i++) { + for(int j = 0; j < clusters.size(); j++) { + if(i >= j) { continue; } // projected energy of c_ij in subspace e_ij @@ -318,8 +318,8 @@ public class ORCLUS<V extends NumberVector<?>> extends AbstractProjectedClusteri } } - while (clusters.size() > k_new) { - if (cprogress != null) { + while(clusters.size() > k_new) { + if(cprogress != null) { cprogress.setProcessed(clusters.size(), LOG); } // find the smallest value of r_ij @@ -327,12 +327,12 @@ public class ORCLUS<V extends NumberVector<?>> extends AbstractProjectedClusteri // renumber 
the clusters by replacing cluster c_i with cluster c_ij // and discarding cluster c_j - for (int c = 0; c < clusters.size(); c++) { - if (c == minPE.i) { + for(int c = 0; c < clusters.size(); c++) { + if(c == minPE.i) { clusters.remove(c); clusters.add(c, minPE.cluster); } - if (c == minPE.j) { + if(c == minPE.j) { clusters.remove(c); } } @@ -341,15 +341,16 @@ public class ORCLUS<V extends NumberVector<?>> extends AbstractProjectedClusteri int i = minPE.i; int j = minPE.j; Iterator<ProjectedEnergy> it = projectedEnergies.iterator(); - while (it.hasNext()) { + while(it.hasNext()) { ProjectedEnergy pe = it.next(); - if (pe.i == i || pe.i == j || pe.j == i || pe.j == j) { + if(pe.i == i || pe.i == j || pe.j == i || pe.j == j) { it.remove(); - } else { - if (pe.i > j) { + } + else { + if(pe.i > j) { pe.i -= 1; } - if (pe.j > j) { + if(pe.j > j) { pe.j -= 1; } } @@ -357,10 +358,11 @@ public class ORCLUS<V extends NumberVector<?>> extends AbstractProjectedClusteri // ... and recompute them ORCLUSCluster c_ij = minPE.cluster; - for (int c = 0; c < clusters.size(); c++) { - if (c < i) { + for(int c = 0; c < clusters.size(); c++) { + if(c < i) { projectedEnergies.add(projectedEnergy(database, distFunc, clusters.get(c), c_ij, c, i, d_new)); - } else if (c > i) { + } + else if(c > i) { projectedEnergies.add(projectedEnergy(database, distFunc, clusters.get(c), c_ij, i, c, d_new)); } } @@ -389,7 +391,7 @@ public class ORCLUS<V extends NumberVector<?>> extends AbstractProjectedClusteri double sum = 0.; V c_proj = projection(c_ij, c_ij.centroid, factory); - for (DBIDIter iter = c_ij.objectIDs.iter(); iter.valid(); iter.advance()) { + for(DBIDIter iter = c_ij.objectIDs.iter(); iter.valid(); iter.advance()) { V o_proj = projection(c_ij, database.get(iter), factory); double dist = distFunc.distance(o_proj, c_proj).doubleValue(); sum += dist * dist; @@ -417,15 +419,16 @@ public class ORCLUS<V extends NumberVector<?>> extends AbstractProjectedClusteri // convert into array. 
c.objectIDs = DBIDUtil.newArray(c.objectIDs); - if (c.objectIDs.size() > 0) { + if(c.objectIDs.size() > 0) { c.centroid = Centroid.make(relation, c.objectIDs).toVector(relation); c.basis = findBasis(relation, distFunc, c, dim); - } else { + } + else { NumberVector.Factory<V, ?> factory = RelationUtil.getNumberVectorFactory(relation); Vector cent = c1.centroid.getColumnVector().plusEquals(c2.centroid.getColumnVector()).timesEquals(0.5); c.centroid = factory.newNumberVector(cent.getArrayRef()); double[][] doubles = new double[c1.basis.getRowDimensionality()][dim]; - for (int i = 0; i < dim; i++) { + for(int i = 0; i < dim; i++) { doubles[i][i] = 1; } c.basis = new Matrix(doubles); @@ -590,16 +593,16 @@ public class ORCLUS<V extends NumberVector<?>> extends AbstractProjectedClusteri protected void configAlpha(Parameterization config) { DoubleParameter alphaP = new DoubleParameter(ALPHA_ID, 0.5); - alphaP.addConstraint(new GreaterConstraint(0)); - alphaP.addConstraint(new LessEqualConstraint(1)); - if (config.grab(alphaP)) { + alphaP.addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE); + alphaP.addConstraint(CommonConstraints.LESS_EQUAL_ONE_DOUBLE); + if(config.grab(alphaP)) { alpha = alphaP.doubleValue(); } } protected void configSeed(Parameterization config) { RandomParameter rndP = new RandomParameter(SEED_ID); - if (config.grab(rndP)) { + if(config.grab(rndP)) { rnd = rndP.getValue(); } } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/EpsilonNeighborPredicate.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/EpsilonNeighborPredicate.java index 545a8171..1b316c7c 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/EpsilonNeighborPredicate.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/EpsilonNeighborPredicate.java @@ -23,7 +23,8 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.gdbscan; along with this program. If not, see <http://www.gnu.org/licenses/>. 
*/ -import de.lmu.ifi.dbs.elki.algorithm.AbstractDistanceBasedAlgorithm; +import de.lmu.ifi.dbs.elki.algorithm.DistanceBasedAlgorithm; +import de.lmu.ifi.dbs.elki.algorithm.clustering.DBSCAN; import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation; import de.lmu.ifi.dbs.elki.data.type.TypeInformation; import de.lmu.ifi.dbs.elki.data.type.TypeUtil; @@ -67,12 +68,12 @@ public class EpsilonNeighborPredicate<O, D extends Distance<D>> implements Neigh /** * Range to query with */ - D epsilon; + protected D epsilon; /** * Distance function to use */ - DistanceFunction<O, D> distFunc; + protected DistanceFunction<O, D> distFunc; /** * Full constructor. @@ -177,14 +178,14 @@ public class EpsilonNeighborPredicate<O, D extends Distance<D>> implements Neigh protected void makeOptions(Parameterization config) { super.makeOptions(config); // Get a distance function. - ObjectParameter<DistanceFunction<O, D>> distanceP = new ObjectParameter<>(AbstractDistanceBasedAlgorithm.DISTANCE_FUNCTION_ID, DistanceFunction.class, EuclideanDistanceFunction.class); + ObjectParameter<DistanceFunction<O, D>> distanceP = new ObjectParameter<>(DistanceBasedAlgorithm.DISTANCE_FUNCTION_ID, DistanceFunction.class, EuclideanDistanceFunction.class); D distanceFactory = null; if(config.grab(distanceP)) { distfun = distanceP.instantiateClass(config); distanceFactory = distfun.getDistanceFactory(); } // Get the epsilon parameter - DistanceParameter<D> epsilonP = new DistanceParameter<>(de.lmu.ifi.dbs.elki.algorithm.clustering.DBSCAN.EPSILON_ID, distanceFactory); + DistanceParameter<D> epsilonP = new DistanceParameter<>(DBSCAN.Parameterizer.EPSILON_ID, distanceFactory); if(config.grab(epsilonP)) { epsilon = epsilonP.getValue(); } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/MinPtsCorePredicate.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/MinPtsCorePredicate.java index a6e62e2e..ac7ba81d 100644 --- 
a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/MinPtsCorePredicate.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/MinPtsCorePredicate.java @@ -23,6 +23,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.gdbscan; along with this program. If not, see <http://www.gnu.org/licenses/>. */ +import de.lmu.ifi.dbs.elki.algorithm.clustering.DBSCAN; import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation; import de.lmu.ifi.dbs.elki.data.type.TypeUtil; import de.lmu.ifi.dbs.elki.database.Database; @@ -54,7 +55,7 @@ public class MinPtsCorePredicate implements CorePredicate { /** * The minpts parameter. */ - int minpts; + protected int minpts; /** * Default constructor. @@ -127,7 +128,7 @@ public class MinPtsCorePredicate implements CorePredicate { protected void makeOptions(Parameterization config) { super.makeOptions(config); // Get the minpts parameter - IntParameter minptsP = new IntParameter(de.lmu.ifi.dbs.elki.algorithm.clustering.DBSCAN.MINPTS_ID); + IntParameter minptsP = new IntParameter(DBSCAN.Parameterizer.MINPTS_ID); if(config.grab(minptsP)) { minpts = minptsP.getValue(); } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/ExtractFlatClusteringFromHierarchy.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/ExtractFlatClusteringFromHierarchy.java index ac5cb77c..f6dbc88f 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/ExtractFlatClusteringFromHierarchy.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/ExtractFlatClusteringFromHierarchy.java @@ -55,7 +55,7 @@ import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress; import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException; import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint; +import 
de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DistanceParameter; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.EnumParameter; @@ -178,9 +178,10 @@ public class ExtractFlatClusteringFromHierarchy<D extends Distance<D>> implement DataStore<D> lambda = pointerresult.getParentDistanceStore(); Clustering<DendrogramModel<D>> result; - if (lambda instanceof DoubleDistanceDataStore) { + if(lambda instanceof DoubleDistanceDataStore) { result = extractClustersDouble(ids, pi, (DoubleDistanceDataStore) lambda); - } else { + } + else { result = extractClusters(ids, pi, lambda); } result.addChildResult(pointerresult); @@ -208,28 +209,31 @@ public class ExtractFlatClusteringFromHierarchy<D extends Distance<D>> implement DBIDArrayIter it = order.iter(); // Used multiple times! int split; - if (minclusters > 0) { + if(minclusters > 0) { split = Math.max(ids.size() - minclusters, 0); // Stop distance: final D stopdist = lambda.get(order.get(split)); // Tie handling: decrement split. - while (split > 0) { + while(split > 0) { it.seek(split - 1); - if (stopdist.compareTo(lambda.get(it)) <= 0) { + if(stopdist.compareTo(lambda.get(it)) <= 0) { split--; - } else { + } + else { break; } } - } else if (threshold != null) { + } + else if(threshold != null) { split = ids.size(); it.seek(split - 1); - while (threshold.compareTo(lambda.get(it)) <= 0 && it.valid()) { + while(threshold.compareTo(lambda.get(it)) <= 0 && it.valid()) { split--; it.retract(); } - } else { // full hierarchy + } + else { // full hierarchy split = 0; } @@ -242,19 +246,20 @@ public class ExtractFlatClusteringFromHierarchy<D extends Distance<D>> implement DBIDVar succ = DBIDUtil.newVar(); // Variable for successor. // Go backwards on the lower part. 
- for (it.seek(split - 1); it.valid(); it.retract()) { + for(it.seek(split - 1); it.valid(); it.retract()) { D dist = lambda.get(it); // Distance to successor pi.assignVar(it, succ); // succ = pi(it) int clusterid = cluster_map.intValue(succ); // Successor cluster has already been created: - if (clusterid >= 0) { + if(clusterid >= 0) { cluster_dbids.get(clusterid).add(it); cluster_map.putInt(it, clusterid); // Update distance to maximum encountered: - if (cluster_dist.get(clusterid).compareTo(dist) < 0) { + if(cluster_dist.get(clusterid).compareTo(dist) < 0) { cluster_dist.set(clusterid, dist); } - } else { + } + else { // Need to start a new cluster: clusterid = cluster_dbids.size(); // next cluster number. ModifiableDBIDs cids = DBIDUtil.newArray(); @@ -270,12 +275,12 @@ public class ExtractFlatClusteringFromHierarchy<D extends Distance<D>> implement } // Decrement counter - if (progress != null) { + if(progress != null) { progress.incrementProcessed(LOG); } } final Clustering<DendrogramModel<D>> dendrogram; - switch(outputmode) { + switch(outputmode){ case PARTIAL_HIERARCHY: { // Build a hierarchy out of these clusters. dendrogram = new Clustering<>("Hierarchical Clustering", "hierarchical-clustering"); @@ -284,74 +289,81 @@ public class ExtractFlatClusteringFromHierarchy<D extends Distance<D>> implement // Convert initial clusters to cluster objects { int i = 0; - for (DBIDIter it2 = cluster_leads.iter(); it2.valid(); it2.advance(), i++) { + for(DBIDIter it2 = cluster_leads.iter(); it2.valid(); it2.advance(), i++) { clusters.add(makeCluster(it2, cluster_dist.get(i), cluster_dbids.get(i))); } cluster_dist = null; // Invalidate cluster_dbids = null; // Invalidate } // Process the upper part, bottom-up. 
- for (it.seek(split); it.valid(); it.advance()) { + for(it.seek(split); it.valid(); it.advance()) { int clusterid = cluster_map.intValue(it); // The current cluster led by the current element: final Cluster<DendrogramModel<D>> clus; - if (clusterid >= 0) { + if(clusterid >= 0) { clus = clusters.get(clusterid); - } else if (!singletons && ids.size() != 1) { + } + else if(!singletons && ids.size() != 1) { clus = null; - } else { + } + else { clus = makeCluster(it, null, DBIDUtil.deref(it)); } // The successor to join: pi.assignVar(it, succ); // succ = pi(it) - if (DBIDUtil.equal(it, succ)) { + if(DBIDUtil.equal(it, succ)) { assert (root == null); root = clus; - } else { + } + else { // Parent cluster: int parentid = cluster_map.intValue(succ); D depth = lambda.get(it); // Parent cluster exists - merge as a new cluster: - if (parentid >= 0) { + if(parentid >= 0) { final Cluster<DendrogramModel<D>> pclus = clusters.get(parentid); - if (pclus.getModel().getDistance().equals(depth)) { - if (clus == null) { + if(pclus.getModel().getDistance().equals(depth)) { + if(clus == null) { ((ModifiableDBIDs) pclus.getIDs()).add(it); - } else { + } + else { dendrogram.addChildCluster(pclus, clus); } - } else { + } + else { // Merge at new depth: ModifiableDBIDs cids = DBIDUtil.newArray(clus == null ? 1 : 0); - if (clus == null) { + if(clus == null) { cids.add(it); } Cluster<DendrogramModel<D>> npclus = makeCluster(succ, depth, cids); - if (clus != null) { + if(clus != null) { dendrogram.addChildCluster(npclus, clus); } dendrogram.addChildCluster(npclus, pclus); // Replace existing parent cluster: new depth clusters.set(parentid, npclus); } - } else { + } + else { // Merge with parent at this depth: final Cluster<DendrogramModel<D>> pclus; - if (!singletons) { + if(!singletons) { ModifiableDBIDs cids = DBIDUtil.newArray(clus == null ? 
2 : 1); cids.add(succ); - if (clus == null) { + if(clus == null) { cids.add(it); } // New cluster for parent and/or new point pclus = makeCluster(succ, depth, cids); - } else { + } + else { // Create a new, one-element cluster for parent, and a merged // cluster on top. pclus = makeCluster(succ, depth, DBIDUtil.EMPTYDBIDS); dendrogram.addChildCluster(pclus, makeCluster(succ, null, DBIDUtil.deref(succ))); } - if (clus != null) { + if(clus != null) { dendrogram.addChildCluster(pclus, clus); } // Store cluster: @@ -362,7 +374,7 @@ public class ExtractFlatClusteringFromHierarchy<D extends Distance<D>> implement } // Decrement counter - if (progress != null) { + if(progress != null) { progress.incrementProcessed(LOG); } } @@ -377,21 +389,21 @@ public class ExtractFlatClusteringFromHierarchy<D extends Distance<D>> implement // Convert initial clusters to cluster objects { int i = 0; - for (DBIDIter it2 = cluster_leads.iter(); it2.valid(); it2.advance(), i++) { + for(DBIDIter it2 = cluster_leads.iter(); it2.valid(); it2.advance(), i++) { dendrogram.addToplevelCluster(makeCluster(it2, cluster_dist.get(i), cluster_dbids.get(i))); } cluster_dist = null; // Invalidate cluster_dbids = null; // Invalidate } // Process the upper part, bottom-up. 
- for (it.seek(split); it.valid(); it.advance()) { + for(it.seek(split); it.valid(); it.advance()) { int clusterid = cluster_map.intValue(it); - if (clusterid < 0) { + if(clusterid < 0) { dendrogram.addToplevelCluster(makeCluster(it, null, DBIDUtil.deref(it))); } // Decrement counter - if (progress != null) { + if(progress != null) { progress.incrementProcessed(LOG); } } @@ -401,7 +413,7 @@ public class ExtractFlatClusteringFromHierarchy<D extends Distance<D>> implement throw new AbortException("Unsupported output mode."); } - if (progress != null) { + if(progress != null) { progress.ensureCompleted(LOG); } @@ -428,29 +440,32 @@ public class ExtractFlatClusteringFromHierarchy<D extends Distance<D>> implement DBIDArrayIter it = order.iter(); // Used multiple times! int split; - if (minclusters > 0) { + if(minclusters > 0) { split = Math.max(ids.size() - minclusters, 0); // Stop distance: final double stopdist = lambda.doubleValue(order.get(split)); // Tie handling: decrement split. - while (split > 0) { + while(split > 0) { it.seek(split - 1); - if (stopdist <= lambda.doubleValue(it)) { + if(stopdist <= lambda.doubleValue(it)) { split--; - } else { + } + else { break; } } - } else if (threshold != null) { + } + else if(threshold != null) { split = ids.size(); it.seek(split - 1); double stopdist = ((DoubleDistance) threshold).doubleValue(); - while (stopdist <= lambda.doubleValue(it) && it.valid()) { + while(stopdist <= lambda.doubleValue(it) && it.valid()) { split--; it.retract(); } - } else { // full hierarchy + } + else { // full hierarchy split = 0; } @@ -463,19 +478,20 @@ public class ExtractFlatClusteringFromHierarchy<D extends Distance<D>> implement DBIDVar succ = DBIDUtil.newVar(); // Variable for successor. // Go backwards on the lower part. 
- for (it.seek(split - 1); it.valid(); it.retract()) { + for(it.seek(split - 1); it.valid(); it.retract()) { double dist = lambda.doubleValue(it); // Distance to successor pi.assignVar(it, succ); // succ = pi(it) int clusterid = cluster_map.intValue(succ); // Successor cluster has already been created: - if (clusterid >= 0) { + if(clusterid >= 0) { cluster_dbids.get(clusterid).add(it); cluster_map.putInt(it, clusterid); // Update distance to maximum encountered: - if (cluster_dist.get(clusterid) < dist) { + if(cluster_dist.get(clusterid) < dist) { cluster_dist.set(clusterid, dist); } - } else { + } + else { // Need to start a new cluster: clusterid = cluster_dbids.size(); // next cluster number. ModifiableDBIDs cids = DBIDUtil.newArray(); @@ -491,12 +507,12 @@ public class ExtractFlatClusteringFromHierarchy<D extends Distance<D>> implement } // Decrement counter - if (progress != null) { + if(progress != null) { progress.incrementProcessed(LOG); } } final Clustering<DendrogramModel<D>> dendrogram; - switch(outputmode) { + switch(outputmode){ case PARTIAL_HIERARCHY: { // Build a hierarchy out of these clusters. dendrogram = new Clustering<>("Hierarchical Clustering", "hierarchical-clustering"); @@ -505,7 +521,7 @@ public class ExtractFlatClusteringFromHierarchy<D extends Distance<D>> implement // Convert initial clusters to cluster objects { int i = 0; - for (DBIDIter it2 = cluster_leads.iter(); it2.valid(); it2.advance(), i++) { + for(DBIDIter it2 = cluster_leads.iter(); it2.valid(); it2.advance(), i++) { @SuppressWarnings("unchecked") D depth = (D) new DoubleDistance(cluster_dist.get(i)); clusters.add(makeCluster(it2, depth, cluster_dbids.get(i))); @@ -514,68 +530,75 @@ public class ExtractFlatClusteringFromHierarchy<D extends Distance<D>> implement cluster_dbids = null; // Invalidate } // Process the upper part, bottom-up. 
- for (it.seek(split); it.valid(); it.advance()) { + for(it.seek(split); it.valid(); it.advance()) { int clusterid = cluster_map.intValue(it); // The current cluster led by the current element: final Cluster<DendrogramModel<D>> clus; - if (clusterid >= 0) { + if(clusterid >= 0) { clus = clusters.get(clusterid); - } else if (!singletons && ids.size() != 1) { + } + else if(!singletons && ids.size() != 1) { clus = null; - } else { + } + else { clus = makeCluster(it, null, DBIDUtil.deref(it)); } // The successor to join: pi.assignVar(it, succ); // succ = pi(it) - if (DBIDUtil.equal(it, succ)) { + if(DBIDUtil.equal(it, succ)) { assert (root == null); root = clus; - } else { + } + else { // Parent cluster: int parentid = cluster_map.intValue(succ); @SuppressWarnings("unchecked") D depth = (D) new DoubleDistance(lambda.doubleValue(it)); // Parent cluster exists - merge as a new cluster: - if (parentid >= 0) { + if(parentid >= 0) { final Cluster<DendrogramModel<D>> pclus = clusters.get(parentid); - if (pclus.getModel().getDistance().equals(depth)) { - if (clus == null) { + if(pclus.getModel().getDistance().equals(depth)) { + if(clus == null) { ((ModifiableDBIDs) pclus.getIDs()).add(it); - } else { + } + else { dendrogram.addChildCluster(pclus, clus); } - } else { + } + else { // Merge at new depth: ModifiableDBIDs cids = DBIDUtil.newArray(clus == null ? 1 : 0); - if (clus == null) { + if(clus == null) { cids.add(it); } Cluster<DendrogramModel<D>> npclus = makeCluster(succ, depth, cids); - if (clus != null) { + if(clus != null) { dendrogram.addChildCluster(npclus, clus); } dendrogram.addChildCluster(npclus, pclus); // Replace existing parent cluster: new depth clusters.set(parentid, npclus); } - } else { + } + else { // Merge with parent at this depth: final Cluster<DendrogramModel<D>> pclus; - if (!singletons) { + if(!singletons) { ModifiableDBIDs cids = DBIDUtil.newArray(clus == null ? 
2 : 1); cids.add(succ); - if (clus == null) { + if(clus == null) { cids.add(it); } // New cluster for parent and/or new point pclus = makeCluster(succ, depth, cids); - } else { + } + else { // Create a new, one-element cluster for parent, and a merged // cluster on top. pclus = makeCluster(succ, depth, DBIDUtil.EMPTYDBIDS); dendrogram.addChildCluster(pclus, makeCluster(succ, null, DBIDUtil.deref(succ))); } - if (clus != null) { + if(clus != null) { dendrogram.addChildCluster(pclus, clus); } // Store cluster: @@ -586,7 +609,7 @@ public class ExtractFlatClusteringFromHierarchy<D extends Distance<D>> implement } // Decrement counter - if (progress != null) { + if(progress != null) { progress.incrementProcessed(LOG); } } @@ -601,7 +624,7 @@ public class ExtractFlatClusteringFromHierarchy<D extends Distance<D>> implement // Convert initial clusters to cluster objects { int i = 0; - for (DBIDIter it2 = cluster_leads.iter(); it2.valid(); it2.advance(), i++) { + for(DBIDIter it2 = cluster_leads.iter(); it2.valid(); it2.advance(), i++) { @SuppressWarnings("unchecked") D depth = (D) new DoubleDistance(cluster_dist.get(i)); dendrogram.addToplevelCluster(makeCluster(it2, depth, cluster_dbids.get(i))); @@ -610,14 +633,14 @@ public class ExtractFlatClusteringFromHierarchy<D extends Distance<D>> implement cluster_dbids = null; // Invalidate } // Process the upper part, bottom-up. 
- for (it.seek(split); it.valid(); it.advance()) { + for(it.seek(split); it.valid(); it.advance()) { int clusterid = cluster_map.intValue(it); - if (clusterid < 0) { + if(clusterid < 0) { dendrogram.addToplevelCluster(makeCluster(it, null, DBIDUtil.deref(it))); } // Decrement counter - if (progress != null) { + if(progress != null) { progress.incrementProcessed(LOG); } } @@ -627,7 +650,7 @@ public class ExtractFlatClusteringFromHierarchy<D extends Distance<D>> implement throw new AbortException("Unsupported output mode."); } - if (progress != null) { + if(progress != null) { progress.ensureCompleted(LOG); } @@ -644,13 +667,16 @@ public class ExtractFlatClusteringFromHierarchy<D extends Distance<D>> implement */ private Cluster<DendrogramModel<D>> makeCluster(DBIDRef lead, D depth, DBIDs members) { final String name; - if (members.size() == 0) { + if(members.size() == 0) { name = "mrg_" + DBIDUtil.toString(lead) + "_" + depth; - } else if (depth != null && depth.isInfiniteDistance() || (members.size() == 1 && members.contains(lead))) { + } + else if(depth != null && depth.isInfiniteDistance() || (members.size() == 1 && members.contains(lead))) { name = "obj_" + DBIDUtil.toString(lead); - } else if (depth != null) { + } + else if(depth != null) { name = "clu_" + DBIDUtil.toString(lead) + "_" + depth; - } else { + } + else { // Complete data set only? 
name = "clu_" + DBIDUtil.toString(lead); } @@ -794,53 +820,54 @@ public class ExtractFlatClusteringFromHierarchy<D extends Distance<D>> implement protected void makeOptions(Parameterization config) { super.makeOptions(config); ObjectParameter<HierarchicalClusteringAlgorithm<D>> algorithmP = new ObjectParameter<>(AlgorithmStep.Parameterizer.ALGORITHM_ID, HierarchicalClusteringAlgorithm.class); - if (config.grab(algorithmP)) { + if(config.grab(algorithmP)) { algorithm = algorithmP.instantiateClass(config); } EnumParameter<ThresholdMode> modeP = new EnumParameter<>(MODE_ID, ThresholdMode.class, ThresholdMode.BY_MINCLUSTERS); - if (config.grab(modeP)) { + if(config.grab(modeP)) { thresholdmode = modeP.getValue(); } - if (thresholdmode == null || ThresholdMode.BY_MINCLUSTERS.equals(thresholdmode)) { + if(thresholdmode == null || ThresholdMode.BY_MINCLUSTERS.equals(thresholdmode)) { IntParameter minclustersP = new IntParameter(MINCLUSTERS_ID); - minclustersP.addConstraint(new GreaterEqualConstraint(1)); - if (config.grab(minclustersP)) { + minclustersP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT); + if(config.grab(minclustersP)) { minclusters = minclustersP.intValue(); } } - if (thresholdmode == null || ThresholdMode.BY_THRESHOLD.equals(thresholdmode)) { + if(thresholdmode == null || ThresholdMode.BY_THRESHOLD.equals(thresholdmode)) { // Fallback to double when no algorithm chosen yet: @SuppressWarnings("unchecked") final D factory = algorithm != null ? 
algorithm.getDistanceFactory() : (D) DoubleDistance.FACTORY; DistanceParameter<D> distP = new DistanceParameter<>(THRESHOLD_ID, factory); - if (config.grab(distP)) { + if(config.grab(distP)) { threshold = distP.getValue(); } } - if (thresholdmode == null || !ThresholdMode.NO_THRESHOLD.equals(thresholdmode)) { + if(thresholdmode == null || !ThresholdMode.NO_THRESHOLD.equals(thresholdmode)) { EnumParameter<OutputMode> outputP = new EnumParameter<>(OUTPUTMODE_ID, OutputMode.class); - if (config.grab(outputP)) { + if(config.grab(outputP)) { outputmode = outputP.getValue(); } - } else { + } + else { // This becomes full hierarchy: minclusters = -1; outputmode = OutputMode.PARTIAL_HIERARCHY; } Flag singletonsF = new Flag(SINGLETONS_ID); - if (config.grab(singletonsF)) { + if(config.grab(singletonsF)) { singletons = singletonsF.isTrue(); } } @Override protected ExtractFlatClusteringFromHierarchy<D> makeInstance() { - switch(thresholdmode) { + switch(thresholdmode){ case NO_THRESHOLD: case BY_MINCLUSTERS: return new ExtractFlatClusteringFromHierarchy<>(algorithm, minclusters, outputmode, singletons); diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeans.java index dc1fa47c..5754e961 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeans.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeans.java @@ -35,6 +35,7 @@ import de.lmu.ifi.dbs.elki.data.model.MeanModel; import de.lmu.ifi.dbs.elki.data.type.CombinedTypeInformation; import de.lmu.ifi.dbs.elki.data.type.TypeInformation; import de.lmu.ifi.dbs.elki.data.type.TypeUtil; +import de.lmu.ifi.dbs.elki.database.datastore.WritableIntegerDataStore; import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.ids.DBID; import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; @@ -49,8 +50,7 @@ import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance; 
import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; import de.lmu.ifi.dbs.elki.utilities.datastructures.QuickSelect; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; @@ -105,68 +105,61 @@ public abstract class AbstractKMeans<V extends NumberVector<?>, D extends Distan * @param relation the database to cluster * @param means a list of k means * @param clusters cluster assignment + * @param assignment Current cluster assignment * @return true when the object was reassigned */ - protected boolean assignToNearestCluster(Relation<V> relation, List<? extends NumberVector<?>> means, List<? extends ModifiableDBIDs> clusters) { + protected boolean assignToNearestCluster(Relation<V> relation, List<? extends NumberVector<?>> means, List<? extends ModifiableDBIDs> clusters, WritableIntegerDataStore assignment) { boolean changed = false; - if (getDistanceFunction() instanceof PrimitiveDoubleDistanceFunction) { + if(getDistanceFunction() instanceof PrimitiveDoubleDistanceFunction) { @SuppressWarnings("unchecked") final PrimitiveDoubleDistanceFunction<? super NumberVector<?>> df = (PrimitiveDoubleDistanceFunction<? 
super NumberVector<?>>) getDistanceFunction(); - for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { double mindist = Double.POSITIVE_INFINITY; V fv = relation.get(iditer); int minIndex = 0; - for (int i = 0; i < k; i++) { + for(int i = 0; i < k; i++) { double dist = df.doubleDistance(fv, means.get(i)); - if (dist < mindist) { + if(dist < mindist) { minIndex = i; mindist = dist; } } - if (clusters.get(minIndex).add(iditer)) { - changed = true; - // Remove from previous cluster - // TODO: keep a list of cluster assignments to save this search? - for (int i = 0; i < k; i++) { - if (i != minIndex) { - if (clusters.get(i).remove(iditer)) { - break; - } - } - } - } + changed |= updateAssignment(iditer, clusters, assignment, minIndex); } - } else { + } + else { final PrimitiveDistanceFunction<? super NumberVector<?>, D> df = getDistanceFunction(); - for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { D mindist = df.getDistanceFactory().infiniteDistance(); V fv = relation.get(iditer); int minIndex = 0; - for (int i = 0; i < k; i++) { + for(int i = 0; i < k; i++) { D dist = df.distance(fv, means.get(i)); - if (dist.compareTo(mindist) < 0) { + if(dist.compareTo(mindist) < 0) { minIndex = i; mindist = dist; } } - if (clusters.get(minIndex).add(iditer)) { - changed = true; - // Remove from previous cluster - // TODO: keep a list of cluster assignments to save this search? - for (int i = 0; i < k; i++) { - if (i != minIndex) { - if (clusters.get(i).remove(iditer)) { - break; - } - } - } - } + changed |= updateAssignment(iditer, clusters, assignment, minIndex); } } return changed; } + protected boolean updateAssignment(DBIDIter iditer, List<? 
extends ModifiableDBIDs> clusters, WritableIntegerDataStore assignment, int newA) { + final int oldA = assignment.intValue(iditer); + if(oldA == newA) { + return false; + } + clusters.get(newA).add(iditer); + assignment.putInt(iditer, newA); + if(oldA >= 0) { + clusters.get(oldA).remove(iditer); + } + return true; + } + @Override public TypeInformation[] getInputTypeRestriction() { return TypeUtil.array(new CombinedTypeInformation(TypeUtil.NUMBER_VECTOR_FIELD, getDistanceFunction().getInputTypeRestriction())); @@ -181,24 +174,28 @@ public abstract class AbstractKMeans<V extends NumberVector<?>, D extends Distan * @return the mean vectors of the given clusters in the given database */ protected List<Vector> means(List<? extends ModifiableDBIDs> clusters, List<? extends NumberVector<?>> means, Relation<V> database) { + // TODO: use Kahan summation for better numerical precision? List<Vector> newMeans = new ArrayList<>(k); - for (int i = 0; i < k; i++) { + for(int i = 0; i < k; i++) { ModifiableDBIDs list = clusters.get(i); Vector mean = null; - if (list.size() > 0) { - double s = 1.0 / list.size(); + if(list.size() > 0) { DBIDIter iter = list.iter(); - assert (iter.valid()); - mean = database.get(iter).getColumnVector().timesEquals(s); + // Initialize with first. + mean = database.get(iter).getColumnVector(); double[] raw = mean.getArrayRef(); iter.advance(); - for (; iter.valid(); iter.advance()) { + // Update with remaining instances + for(; iter.valid(); iter.advance()) { NumberVector<?> vec = database.get(iter); - for (int j = 0; j < mean.getDimensionality(); j++) { - raw[j] += s * vec.doubleValue(j); + for(int j = 0; j < mean.getDimensionality(); j++) { + raw[j] += vec.doubleValue(j); } } - } else { + mean.timesEquals(1.0 / list.size()); + } + else { + // Keep degenerated means as-is for now. 
mean = means.get(i).getColumnVector(); } newMeans.add(mean); @@ -218,17 +215,18 @@ public abstract class AbstractKMeans<V extends NumberVector<?>, D extends Distan final int dim = medians.get(0).getDimensionality(); final SortDBIDsBySingleDimension sorter = new SortDBIDsBySingleDimension(database); List<NumberVector<?>> newMedians = new ArrayList<>(k); - for (int i = 0; i < k; i++) { + for(int i = 0; i < k; i++) { ArrayModifiableDBIDs list = DBIDUtil.newArray(clusters.get(i)); - if (list.size() > 0) { + if(list.size() > 0) { Vector mean = new Vector(dim); - for (int d = 0; d < dim; d++) { + for(int d = 0; d < dim; d++) { sorter.setDimension(d); DBID id = QuickSelect.median(list, sorter); mean.set(d, database.get(id).doubleValue(d)); } newMedians.add(mean); - } else { + } + else { newMedians.add((NumberVector<?>) medians.get(i)); } } @@ -244,14 +242,11 @@ public abstract class AbstractKMeans<V extends NumberVector<?>, D extends Distan * @param op Cluster size change / Weight change */ protected void incrementalUpdateMean(Vector mean, V vec, int newsize, double op) { - if (newsize == 0) { + if(newsize == 0) { return; // Keep old mean } - Vector delta = vec.getColumnVector(); - // Compute difference from mean - delta.minusEquals(mean); - delta.timesEquals(op / newsize); - mean.plusEquals(delta); + Vector delta = vec.getColumnVector().minusEquals(mean); + mean.plusTimesEquals(delta, op / newsize); } /** @@ -260,76 +255,84 @@ public abstract class AbstractKMeans<V extends NumberVector<?>, D extends Distan * @param relation Relation * @param means Means * @param clusters Clusters + * @param assignment Current cluster assignment * @return true when the means have changed */ - protected boolean macQueenIterate(Relation<V> relation, List<Vector> means, List<ModifiableDBIDs> clusters) { + protected boolean macQueenIterate(Relation<V> relation, List<Vector> means, List<ModifiableDBIDs> clusters, WritableIntegerDataStore assignment) { boolean changed = false; - if 
(getDistanceFunction() instanceof PrimitiveDoubleDistanceFunction) { + if(getDistanceFunction() instanceof PrimitiveDoubleDistanceFunction) { // Raw distance function @SuppressWarnings("unchecked") final PrimitiveDoubleDistanceFunction<? super NumberVector<?>> df = (PrimitiveDoubleDistanceFunction<? super NumberVector<?>>) getDistanceFunction(); // Incremental update - for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { double mindist = Double.POSITIVE_INFINITY; V fv = relation.get(iditer); int minIndex = 0; - for (int i = 0; i < k; i++) { + for(int i = 0; i < k; i++) { double dist = df.doubleDistance(fv, means.get(i)); - if (dist < mindist) { + if(dist < mindist) { minIndex = i; mindist = dist; } } - // Update the cluster mean incrementally: - for (int i = 0; i < k; i++) { - ModifiableDBIDs ci = clusters.get(i); - if (i == minIndex) { - if (ci.add(iditer)) { - incrementalUpdateMean(means.get(i), fv, ci.size(), +1); - changed = true; - } - } else if (ci.remove(iditer)) { - incrementalUpdateMean(means.get(i), fv, ci.size() + 1, -1); - changed = true; - } - } + changed |= updateMeanAndAssignment(clusters, means, minIndex, fv, iditer, assignment); } - } else { + } + else { // Raw distance function final PrimitiveDistanceFunction<? 
super NumberVector<?>, D> df = getDistanceFunction(); // Incremental update - for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { D mindist = df.getDistanceFactory().infiniteDistance(); V fv = relation.get(iditer); int minIndex = 0; - for (int i = 0; i < k; i++) { + for(int i = 0; i < k; i++) { D dist = df.distance(fv, means.get(i)); - if (dist.compareTo(mindist) < 0) { + if(dist.compareTo(mindist) < 0) { minIndex = i; mindist = dist; } } - // Update the cluster mean incrementally: - for (int i = 0; i < k; i++) { - ModifiableDBIDs ci = clusters.get(i); - if (i == minIndex) { - if (ci.add(iditer)) { - incrementalUpdateMean(means.get(i), fv, ci.size(), +1); - changed = true; - } - } else if (ci.remove(iditer)) { - incrementalUpdateMean(means.get(i), fv, ci.size() + 1, -1); - changed = true; - } - } + changed |= updateMeanAndAssignment(clusters, means, minIndex, fv, iditer, assignment); } } return changed; } + /** + * Try to update the cluster assignment. 
+ * + * @param clusters Current clusters + * @param means Means to update + * @param minIndex Cluster to assign to + * @param fv Vector + * @param iditer Object ID + * @param assignment Current cluster assignment + * @return {@code true} when assignment changed + */ + private boolean updateMeanAndAssignment(List<ModifiableDBIDs> clusters, List<Vector> means, int minIndex, V fv, DBIDIter iditer, WritableIntegerDataStore assignment) { + int cur = assignment.intValue(iditer); + if(cur == minIndex) { + return false; + } + final ModifiableDBIDs curclus = clusters.get(minIndex); + curclus.add(iditer); + incrementalUpdateMean(means.get(minIndex), fv, curclus.size(), +1); + + if(cur >= 0) { + ModifiableDBIDs ci = clusters.get(cur); + ci.remove(iditer); + incrementalUpdateMean(means.get(cur), fv, ci.size() + 1, -1); + } + + assignment.putInt(iditer, minIndex); + return true; + } + @Override public void setK(int k) { this.k = k; @@ -366,27 +369,27 @@ public abstract class AbstractKMeans<V extends NumberVector<?>, D extends Distan @Override protected void makeOptions(Parameterization config) { ObjectParameter<PrimitiveDistanceFunction<NumberVector<?>, D>> distanceFunctionP = makeParameterDistanceFunction(SquaredEuclideanDistanceFunction.class, PrimitiveDistanceFunction.class); - if (config.grab(distanceFunctionP)) { + if(config.grab(distanceFunctionP)) { distanceFunction = distanceFunctionP.instantiateClass(config); - if (!(distanceFunction instanceof EuclideanDistanceFunction) && !(distanceFunction instanceof SquaredEuclideanDistanceFunction)) { + if(!(distanceFunction instanceof EuclideanDistanceFunction) && !(distanceFunction instanceof SquaredEuclideanDistanceFunction)) { getLogger().warning("k-means optimizes the sum of squares - it should be used with squared euclidean distance and may stop converging otherwise!"); } } IntParameter kP = new IntParameter(K_ID); - kP.addConstraint(new GreaterConstraint(0)); - if (config.grab(kP)) { + 
kP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT); + if(config.grab(kP)) { k = kP.getValue(); } ObjectParameter<KMeansInitialization<V>> initialP = new ObjectParameter<>(INIT_ID, KMeansInitialization.class, RandomlyChosenInitialMeans.class); - if (config.grab(initialP)) { + if(config.grab(initialP)) { initializer = initialP.instantiateClass(config); } IntParameter maxiterP = new IntParameter(MAXITER_ID, 0); - maxiterP.addConstraint(new GreaterEqualConstraint(0)); - if (config.grab(maxiterP)) { + maxiterP.addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_INT); + if(config.grab(maxiterP)) { maxiter = maxiterP.getValue(); } } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/BestOfMultipleKMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/BestOfMultipleKMeans.java index 30bb640c..51e7ace9 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/BestOfMultipleKMeans.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/BestOfMultipleKMeans.java @@ -38,7 +38,7 @@ import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress; import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException; import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; @@ -90,34 +90,35 @@ public class BestOfMultipleKMeans<V extends NumberVector<?>, D extends Distance< @Override public Clustering<M> run(Database database, Relation<V> relation) { - if (!(innerkMeans.getDistanceFunction() instanceof PrimitiveDistanceFunction)) { + 
if(!(innerkMeans.getDistanceFunction() instanceof PrimitiveDistanceFunction)) { throw new AbortException("K-Means results can only be evaluated for primitive distance functions, got: " + innerkMeans.getDistanceFunction().getClass()); } final PrimitiveDistanceFunction<? super V, D> df = (PrimitiveDistanceFunction<? super V, D>) innerkMeans.getDistanceFunction(); Clustering<M> bestResult = null; - if (trials > 1) { + if(trials > 1) { double bestCost = Double.POSITIVE_INFINITY; FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("K-means iterations", trials, LOG) : null; - for (int i = 0; i < trials; i++) { + for(int i = 0; i < trials; i++) { Clustering<M> currentCandidate = innerkMeans.run(database, relation); double currentCost = qualityMeasure.calculateCost(currentCandidate, df, relation); - if (LOG.isVerbose()) { + if(LOG.isVerbose()) { LOG.verbose("Cost of candidate " + i + ": " + currentCost); } - if (currentCost < bestCost) { + if(currentCost < bestCost) { bestResult = currentCandidate; bestCost = currentCost; } - if (prog != null) { + if(prog != null) { prog.incrementProcessed(LOG); } } - if (prog != null) { + if(prog != null) { prog.ensureCompleted(LOG); } - } else { + } + else { bestResult = innerkMeans.run(database); } @@ -195,18 +196,18 @@ public class BestOfMultipleKMeans<V extends NumberVector<?>, D extends Distance< @Override protected void makeOptions(Parameterization config) { IntParameter trialsP = new IntParameter(TRIALS_ID); - trialsP.addConstraint(new GreaterEqualConstraint(1)); - if (config.grab(trialsP)) { + trialsP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT); + if(config.grab(trialsP)) { trials = trialsP.intValue(); } ObjectParameter<KMeans<V, D, M>> kMeansVariantP = new ObjectParameter<>(KMEANS_ID, KMeans.class); - if (config.grab(kMeansVariantP)) { + if(config.grab(kMeansVariantP)) { kMeansVariant = kMeansVariantP.instantiateClass(config); } ObjectParameter<KMeansQualityMeasure<V, ? 
super D>> qualityMeasureP = new ObjectParameter<>(QUALITYMEASURE_ID, KMeansQualityMeasure.class); - if (config.grab(qualityMeasureP)) { + if(config.grab(qualityMeasureP)) { qualityMeasure = qualityMeasureP.instantiateClass(config); } } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/FarthestPointsInitialMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/FarthestPointsInitialMeans.java index a018c04b..9edfd816 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/FarthestPointsInitialMeans.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/FarthestPointsInitialMeans.java @@ -24,7 +24,6 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; */ import java.util.ArrayList; import java.util.List; -import java.util.Random; import de.lmu.ifi.dbs.elki.data.NumberVector; import de.lmu.ifi.dbs.elki.database.Database; @@ -74,7 +73,7 @@ public class FarthestPointsInitialMeans<V, D extends NumberDistance<D, ?>> exten @Override public List<V> chooseInitialMeans(Database database, Relation<V> relation, int k, PrimitiveDistanceFunction<? super NumberVector<?>, ?> distanceFunction) { // Get a distance query - if (!(distanceFunction.getDistanceFactory() instanceof NumberDistance)) { + if(!(distanceFunction.getDistanceFactory() instanceof NumberDistance)) { throw new AbortException("Farthest points K-Means initialization can only be used with numerical distances."); } @SuppressWarnings("unchecked") @@ -84,26 +83,25 @@ public class FarthestPointsInitialMeans<V, D extends NumberDistance<D, ?>> exten // Chose first mean List<V> means = new ArrayList<>(k); - Random random = rnd.getRandom(); - DBIDIter first = DBIDUtil.randomSample(relation.getDBIDs(), 1, new Random(random.nextLong())).iter(); + DBIDIter first = DBIDUtil.randomSample(relation.getDBIDs(), 1, rnd).iter(); means.add(relation.get(first)); DBIDVar best = DBIDUtil.newVar(first); - for (int i = (dropfirst ? 0 : 1); i < k; i++) { + for(int i = (dropfirst ? 
0 : 1); i < k; i++) { // Find farthest object: double maxdist = Double.NEGATIVE_INFINITY; - for (DBIDIter it = relation.iterDBIDs(); it.valid(); it.advance()) { + for(DBIDIter it = relation.iterDBIDs(); it.valid(); it.advance()) { double dsum = 0.; - for (V ex : means) { + for(V ex : means) { dsum += distQ.distance(ex, it).doubleValue(); } - if (dsum > maxdist) { + if(dsum > maxdist) { maxdist = dsum; best.set(it); } } // Add new mean: - if (k == 0) { + if(k == 0) { means.clear(); // Remove temporary first element. } means.add(relation.get(best)); @@ -114,7 +112,7 @@ public class FarthestPointsInitialMeans<V, D extends NumberDistance<D, ?>> exten @Override public DBIDs chooseInitialMedoids(int k, DistanceQuery<? super V, ?> distQ2) { - if (!(distQ2.getDistanceFactory() instanceof NumberDistance)) { + if(!(distQ2.getDistanceFactory() instanceof NumberDistance)) { throw new AbortException("Farthest points K-Means initialization can only be used with numerical distances."); } @SuppressWarnings("unchecked") @@ -123,26 +121,25 @@ public class FarthestPointsInitialMeans<V, D extends NumberDistance<D, ?>> exten // Chose first mean ArrayModifiableDBIDs means = DBIDUtil.newArray(k); - Random random = rnd.getRandom(); - DBIDIter first = DBIDUtil.randomSample(relation.getDBIDs(), 1, new Random(random.nextLong())).iter(); + DBIDIter first = DBIDUtil.randomSample(relation.getDBIDs(), 1, rnd).iter(); means.add(first); DBIDVar best = DBIDUtil.newVar(first); - for (int i = (dropfirst ? 0 : 1); i < k; i++) { + for(int i = (dropfirst ? 
0 : 1); i < k; i++) { // Find farthest object: double maxdist = Double.NEGATIVE_INFINITY; - for (DBIDIter it = relation.iterDBIDs(); it.valid(); it.advance()) { + for(DBIDIter it = relation.iterDBIDs(); it.valid(); it.advance()) { double dsum = 0.; - for (DBIDIter ex = means.iter(); ex.valid(); ex.advance()) { + for(DBIDIter ex = means.iter(); ex.valid(); ex.advance()) { dsum += distQ.distance(ex, it).doubleValue(); } - if (dsum > maxdist) { + if(dsum > maxdist) { maxdist = dsum; best.set(it); } } // Add new mean: - if (k == 0) { + if(k == 0) { means.clear(); // Remove temporary first element. } means.add(best); @@ -173,7 +170,7 @@ public class FarthestPointsInitialMeans<V, D extends NumberDistance<D, ?>> exten protected void makeOptions(Parameterization config) { super.makeOptions(config); Flag dropfirstP = new Flag(DROPFIRST_ID); - if (config.grab(dropfirstP)) { + if(config.grab(dropfirstP)) { dropfirst = dropfirstP.isTrue(); } } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansBatchedLloyd.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansBatchedLloyd.java new file mode 100644 index 00000000..aec4fe0f --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansBatchedLloyd.java @@ -0,0 +1,346 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2013 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import de.lmu.ifi.dbs.elki.data.Cluster; +import de.lmu.ifi.dbs.elki.data.Clustering; +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.data.model.KMeansModel; +import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; +import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; +import de.lmu.ifi.dbs.elki.database.datastore.WritableIntegerDataStore; +import de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; +import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.ids.DBIDs; +import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.database.relation.RelationUtil; +import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDoubleDistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance; +import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress; +import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress; +import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; +import de.lmu.ifi.dbs.elki.utilities.RandomFactory; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; +import 
de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.RandomParameter; + +/** + * Provides the k-means algorithm, using Lloyd-style bulk iterations. + * + * However, in contrast to Lloyd's k-means and similar to MacQueen, we do update + * the mean vectors multiple times, not only at the very end of the iteration. + * This should yield faster convergence at little extra cost. + * + * To avoid issues with ordered data, we use random sampling to obtain the data + * blocks. + * + * @author Erich Schubert + * + * @apiviz.has KMeansModel + * + * @param <V> vector datatype + * @param <D> distance value type + */ +public class KMeansBatchedLloyd<V extends NumberVector<?>, D extends Distance<D>> extends AbstractKMeans<V, D, KMeansModel<V>> { + /** + * The logger for this class. + */ + private static final Logging LOG = Logging.getLogger(KMeansBatchedLloyd.class); + + /** + * Number of blocks to use. + */ + int blocks; + + /** + * Random used for partitioning. + */ + RandomFactory random; + + /** + * Constructor. + * + * @param distanceFunction distance function + * @param k k parameter + * @param maxiter Maxiter parameter + * @param initializer Initialization method + * @param blocks Number of blocks + * @param random Random factory used for partitioning. + */ + public KMeansBatchedLloyd(PrimitiveDistanceFunction<NumberVector<?>, D> distanceFunction, int k, int maxiter, KMeansInitialization<V> initializer, int blocks, RandomFactory random) { + super(distanceFunction, k, maxiter, initializer); + this.blocks = blocks; + this.random = random; + } + + @Override + public Clustering<KMeansModel<V>> run(Database database, Relation<V> relation) { + final int dim = RelationUtil.dimensionality(relation); + // Choose initial means + List<? 
extends NumberVector<?>> mvs = initializer.chooseInitialMeans(database, relation, k, getDistanceFunction()); + // Convert to (modifiable) math vectors. + List<Vector> means = new ArrayList<>(k); + for (NumberVector<?> m : mvs) { + means.add(m.getColumnVector()); + } + + // Setup cluster assignment store + List<ModifiableDBIDs> clusters = new ArrayList<>(); + for (int i = 0; i < k; i++) { + clusters.add(DBIDUtil.newHashSet((int) (relation.size() * 2. / k))); + } + WritableIntegerDataStore assignment = DataStoreUtil.makeIntegerStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, -1); + + ArrayDBIDs[] parts = DBIDUtil.randomSplit(relation.getDBIDs(), blocks, random); + + double[][] meanshift = new double[k][dim]; + int[] changesize = new int[k]; + + IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("K-Means iteration", LOG) : null; + for (int iteration = 0; maxiter <= 0 || iteration < maxiter; iteration++) { + if (prog != null) { + prog.incrementProcessed(LOG); + } + boolean changed = false; + FiniteProgress pprog = LOG.isVerbose() ? new FiniteProgress("Batch", parts.length, LOG) : null; + for (int p = 0; p < parts.length; p++) { + // Initialize new means scratch space. + for (int i = 0; i < k; i++) { + Arrays.fill(meanshift[i], 0.); + } + Arrays.fill(changesize, 0); + changed |= assignToNearestCluster(relation, parts[p], means, meanshift, changesize, clusters, assignment); + // Recompute means. + updateMeans(means, meanshift, clusters, changesize); + if (pprog != null) { + pprog.incrementProcessed(LOG); + } + } + if (pprog != null) { + pprog.ensureCompleted(LOG); + } + // Stop if no cluster assignment changed. 
+ if (!changed) { + break; + } + } + if (prog != null) { + prog.setCompleted(LOG); + } + + // Wrap result + final NumberVector.Factory<V, ?> factory = RelationUtil.getNumberVectorFactory(relation); + Clustering<KMeansModel<V>> result = new Clustering<>("k-Means Clustering", "kmeans-clustering"); + for (int i = 0; i < clusters.size(); i++) { + KMeansModel<V> model = new KMeansModel<>(factory.newNumberVector(means.get(i).getColumnVector().getArrayRef())); + result.addToplevelCluster(new Cluster<>(clusters.get(i), model)); + } + return result; + } + + /** + * Returns a list of clusters. The k<sup>th</sup> cluster contains the ids of + * those FeatureVectors, that are nearest to the k<sup>th</sup> mean. + * + * @param relation the database to cluster + * @param ids IDs to process + * @param oldmeans a list of k means + * @param meanshift delta to apply to each mean + * @param changesize New cluster sizes + * @param clusters cluster assignment + * @param assignment Current cluster assignment + * @return true when the object was reassigned + */ + protected boolean assignToNearestCluster(Relation<V> relation, DBIDs ids, List<? extends NumberVector<?>> oldmeans, double[][] meanshift, int[] changesize, List<? extends ModifiableDBIDs> clusters, WritableIntegerDataStore assignment) { + boolean changed = false; + + if (getDistanceFunction() instanceof PrimitiveDoubleDistanceFunction) { + @SuppressWarnings("unchecked") + final PrimitiveDoubleDistanceFunction<? super NumberVector<?>> df = (PrimitiveDoubleDistanceFunction<? 
super NumberVector<?>>) getDistanceFunction(); + for (DBIDIter iditer = ids.iter(); iditer.valid(); iditer.advance()) { + double mindist = Double.POSITIVE_INFINITY; + V fv = relation.get(iditer); + int minIndex = 0; + for (int i = 0; i < k; i++) { + double dist = df.doubleDistance(fv, oldmeans.get(i)); + if (dist < mindist) { + minIndex = i; + mindist = dist; + } + } + changed |= updateAssignment(iditer, fv, clusters, assignment, meanshift, changesize, minIndex); + } + } else { + final PrimitiveDistanceFunction<? super NumberVector<?>, D> df = getDistanceFunction(); + for (DBIDIter iditer = ids.iter(); iditer.valid(); iditer.advance()) { + D mindist = df.getDistanceFactory().infiniteDistance(); + V fv = relation.get(iditer); + int minIndex = 0; + for (int i = 0; i < k; i++) { + D dist = df.distance(fv, oldmeans.get(i)); + if (dist.compareTo(mindist) < 0) { + minIndex = i; + mindist = dist; + } + } + changed |= updateAssignment(iditer, fv, clusters, assignment, meanshift, changesize, minIndex); + } + } + return changed; + } + + /** + * Update the assignment of a single object. + * + * @param id Object to assign + * @param fv Vector + * @param clusters Clusters + * @param assignment Current cluster assignment + * @param meanshift Current shifting offset + * @param changesize Size change of the current cluster + * @param minIndex Index of best cluster. + * @return {@code true} when assignment changed. + */ + protected boolean updateAssignment(DBIDIter id, V fv, List<? extends ModifiableDBIDs> clusters, WritableIntegerDataStore assignment, double[][] meanshift, int[] changesize, int minIndex) { + int cur = assignment.intValue(id); + if (cur == minIndex) { + return false; + } + // Add to new cluster. 
+ { + clusters.get(minIndex).add(id); + changesize[minIndex]++; + double[] raw = meanshift[minIndex]; + for (int j = 0; j < fv.getDimensionality(); j++) { + raw[j] += fv.doubleValue(j); + } + } + // Remove from previous cluster + if (cur >= 0) { + clusters.get(cur).remove(id); + changesize[cur]--; + double[] raw = meanshift[cur]; + for (int j = 0; j < fv.getDimensionality(); j++) { + raw[j] -= fv.doubleValue(j); + } + } + assignment.putInt(id, minIndex); + return true; + } + + /** + * Merge changes into mean vectors. + * + * @param means Mean vectors + * @param meanshift Shift offset + * @param clusters + * @param changesize Size of change (for weighting!) + */ + protected void updateMeans(List<Vector> means, double[][] meanshift, List<ModifiableDBIDs> clusters, int[] changesize) { + for (int i = 0; i < k; i++) { + int newsize = clusters.get(i).size(), oldsize = newsize - changesize[i]; + if (newsize == 0) { + continue; // Keep previous mean vector. + } + if (oldsize == 0) { + means.set(i, new Vector(meanshift[i]).times(1. / newsize)); + continue; // Replace with new vector. + } + if (oldsize == newsize) { + means.get(i).plusTimesEquals(new Vector(meanshift[i]), 1. / (double) newsize); + continue; + } + means.get(i).timesEquals(oldsize / (double) newsize).plusTimesEquals(new Vector(meanshift[i]), 1. / (double) newsize); + } + } + + @Override + protected Logging getLogger() { + return LOG; + } + + /** + * Parameterization class. + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer<V extends NumberVector<?>, D extends Distance<D>> extends AbstractKMeans.Parameterizer<V, D> { + /** + * Parameter for the number of blocks. + */ + public static final OptionID BLOCKS_ID = new OptionID("kmeans.blocks", "Number of blocks to use for processing. Means will be recomputed after each block."); + + /** + * Random source for blocking. 
+ */ + public static final OptionID RANDOM_ID = new OptionID("kmeans.blocks.random", "Random source for producing blocks."); + + /** + * Number of blocks. + */ + int blocks; + + /** + * Random used for partitioning. + */ + RandomFactory random; + + @Override + protected void makeOptions(Parameterization config) { + super.makeOptions(config); + IntParameter blocksP = new IntParameter(BLOCKS_ID, 10); + blocksP.addConstraint(CommonConstraints.GREATER_THAN_ONE_INT); + if (config.grab(blocksP)) { + blocks = blocksP.intValue(); + } + RandomParameter randomP = new RandomParameter(RANDOM_ID); + if (config.grab(randomP)) { + random = randomP.getValue(); + } + } + + @Override + protected Logging getLogger() { + return LOG; + } + + @Override + protected KMeansBatchedLloyd<V, D> makeInstance() { + return new KMeansBatchedLloyd<>(distanceFunction, k, maxiter, initializer, blocks, random); + } + } +} diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansBisecting.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansBisecting.java index 37071d36..80a581b1 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansBisecting.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansBisecting.java @@ -41,7 +41,7 @@ import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress; import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ChainedParameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ListParameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; @@ -205,7 +205,7 @@ public 
class KMeansBisecting<V extends NumberVector<?>, D extends Distance<?>, M super.makeOptions(config); IntParameter kP = new IntParameter(KMeans.K_ID); - kP.addConstraint(new GreaterConstraint(1)); + kP.addConstraint(CommonConstraints.GREATER_THAN_ONE_INT); if (config.grab(kP)) { k = kP.intValue(); } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansHybridLloydMacQueen.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansHybridLloydMacQueen.java new file mode 100644 index 00000000..2a60ef27 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansHybridLloydMacQueen.java @@ -0,0 +1,155 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2013 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +import java.util.ArrayList; +import java.util.List; + +import de.lmu.ifi.dbs.elki.data.Cluster; +import de.lmu.ifi.dbs.elki.data.Clustering; +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.data.model.KMeansModel; +import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; +import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; +import de.lmu.ifi.dbs.elki.database.datastore.WritableIntegerDataStore; +import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.database.relation.RelationUtil; +import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance; +import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress; +import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; + +/** + * Provides the k-means algorithm, alternating between MacQueen-style + * incremental processing and Lloyd-Style batch steps. + * + * @author Erich Schubert + * + * @apiviz.landmark + * @apiviz.has KMeansModel + * + * @param <V> vector datatype + * @param <D> distance value type + */ +public class KMeansHybridLloydMacQueen<V extends NumberVector<?>, D extends Distance<D>> extends AbstractKMeans<V, D, KMeansModel<V>> { + /** + * The logger for this class. + */ + private static final Logging LOG = Logging.getLogger(KMeansHybridLloydMacQueen.class); + + /** + * Constructor. 
+ * + * @param distanceFunction distance function + * @param k k parameter + * @param maxiter Maxiter parameter + * @param initializer Initialization method + */ + public KMeansHybridLloydMacQueen(PrimitiveDistanceFunction<NumberVector<?>, D> distanceFunction, int k, int maxiter, KMeansInitialization<V> initializer) { + super(distanceFunction, k, maxiter, initializer); + } + + @Override + public Clustering<KMeansModel<V>> run(Database database, Relation<V> relation) { + if (relation.size() <= 0) { + return new Clustering<>("k-Means Clustering", "kmeans-clustering"); + } + // Choose initial means + List<Vector> means = new ArrayList<>(k); + for (NumberVector<?> nv : initializer.chooseInitialMeans(database, relation, k, getDistanceFunction())) { + means.add(nv.getColumnVector()); + } + // Setup cluster assignment store + List<ModifiableDBIDs> clusters = new ArrayList<>(); + for (int i = 0; i < k; i++) { + clusters.add(DBIDUtil.newHashSet((int) (relation.size() * 2. / k))); + } + WritableIntegerDataStore assignment = DataStoreUtil.makeIntegerStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, -1); + + IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("K-Means iteration", LOG) : null; + for (int iteration = 0; maxiter <= 0 || iteration < maxiter; iteration += 2) { + { // MacQueen + if (prog != null) { + prog.incrementProcessed(LOG); + } + boolean changed = macQueenIterate(relation, means, clusters, assignment); + if (!changed) { + break; + } + } + { // Lloyd + if (prog != null) { + prog.incrementProcessed(LOG); + } + boolean changed = assignToNearestCluster(relation, means, clusters, assignment); + // Stop if no cluster assignment changed. + if (!changed) { + break; + } + // Recompute means. 
+ means = means(clusters, means, relation); + } + } + if (prog != null) { + prog.setCompleted(LOG); + } + + // Wrap result + final NumberVector.Factory<V, ?> factory = RelationUtil.getNumberVectorFactory(relation); + Clustering<KMeansModel<V>> result = new Clustering<>("k-Means Clustering", "kmeans-clustering"); + for (int i = 0; i < clusters.size(); i++) { + KMeansModel<V> model = new KMeansModel<>(factory.newNumberVector(means.get(i).getColumnVector().getArrayRef())); + result.addToplevelCluster(new Cluster<>(clusters.get(i), model)); + } + return result; + } + + @Override + protected Logging getLogger() { + return LOG; + } + + /** + * Parameterization class. + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer<V extends NumberVector<?>, D extends Distance<D>> extends AbstractKMeans.Parameterizer<V, D> { + @Override + protected Logging getLogger() { + return LOG; + } + + @Override + protected KMeansHybridLloydMacQueen<V, D> makeInstance() { + return new KMeansHybridLloydMacQueen<>(distanceFunction, k, maxiter, initializer); + } + } +} diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansLloyd.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansLloyd.java index e692293c..686e2076 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansLloyd.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansLloyd.java @@ -31,6 +31,9 @@ import de.lmu.ifi.dbs.elki.data.Clustering; import de.lmu.ifi.dbs.elki.data.NumberVector; import de.lmu.ifi.dbs.elki.data.model.KMeansModel; import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; +import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; +import de.lmu.ifi.dbs.elki.database.datastore.WritableIntegerDataStore; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.relation.Relation; 
@@ -93,15 +96,16 @@ public class KMeansLloyd<V extends NumberVector<?>, D extends Distance<D>> exten // Setup cluster assignment store List<ModifiableDBIDs> clusters = new ArrayList<>(); for (int i = 0; i < k; i++) { - clusters.add(DBIDUtil.newHashSet(relation.size() / k)); + clusters.add(DBIDUtil.newHashSet((int) (relation.size() * 2. / k))); } + WritableIntegerDataStore assignment = DataStoreUtil.makeIntegerStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, -1); IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("K-Means iteration", LOG) : null; for (int iteration = 0; maxiter <= 0 || iteration < maxiter; iteration++) { if (prog != null) { prog.incrementProcessed(LOG); } - boolean changed = assignToNearestCluster(relation, means, clusters); + boolean changed = assignToNearestCluster(relation, means, clusters, assignment); // Stop if no cluster assignment changed. if (!changed) { break; diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansMacQueen.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansMacQueen.java index bb689bd3..a0f4bb3f 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansMacQueen.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansMacQueen.java @@ -31,6 +31,9 @@ import de.lmu.ifi.dbs.elki.data.Clustering; import de.lmu.ifi.dbs.elki.data.NumberVector; import de.lmu.ifi.dbs.elki.data.model.KMeansModel; import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; +import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; +import de.lmu.ifi.dbs.elki.database.datastore.WritableIntegerDataStore; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; @@ -95,11 +98,9 @@ public class KMeansMacQueen<V extends NumberVector<?>, D extends Distance<D>> ex // Initialize cluster and assign objects 
List<ModifiableDBIDs> clusters = new ArrayList<>(); for (int i = 0; i < k; i++) { - clusters.add(DBIDUtil.newHashSet(relation.size() / k)); + clusters.add(DBIDUtil.newHashSet((int) (relation.size() * 2. / k))); } - assignToNearestCluster(relation, means, clusters); - // Initial recomputation of the means. - means = means(clusters, means, relation); + WritableIntegerDataStore assignment = DataStoreUtil.makeIntegerStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, -1); IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("K-Means iteration", LOG) : null; // Refine result @@ -107,7 +108,7 @@ public class KMeansMacQueen<V extends NumberVector<?>, D extends Distance<D>> ex if (prog != null) { prog.incrementProcessed(LOG); } - boolean changed = macQueenIterate(relation, means, clusters); + boolean changed = macQueenIterate(relation, means, clusters, assignment); if (!changed) { break; } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansPlusPlusInitialMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansPlusPlusInitialMeans.java index 302ca86b..6fc514eb 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansPlusPlusInitialMeans.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansPlusPlusInitialMeans.java @@ -84,8 +84,8 @@ public class KMeansPlusPlusInitialMeans<V, D extends NumberDistance<D, ?>> exten // Chose first mean List<V> means = new ArrayList<>(k); - Random random = rnd.getRandom(); - DBID first = DBIDUtil.deref(DBIDUtil.randomSample(relation.getDBIDs(), 1, new Random(random.nextLong())).iter()); + Random random = rnd.getSingleThreadedRandom(); + DBID first = DBIDUtil.deref(DBIDUtil.randomSample(relation.getDBIDs(), 1, random).iter()); means.add(relation.get(first)); ArrayDBIDs ids = DBIDUtil.ensureArray(relation.getDBIDs()); @@ -134,8 +134,8 @@ public class KMeansPlusPlusInitialMeans<V, D extends NumberDistance<D, ?>> exten // Chose first mean 
ArrayModifiableDBIDs means = DBIDUtil.newArray(k); - Random random = rnd.getRandom(); - DBID first = DBIDUtil.deref(DBIDUtil.randomSample(distQ.getRelation().getDBIDs(), 1, new Random(random.nextLong())).iter()); + Random random = rnd.getSingleThreadedRandom(); + DBID first = DBIDUtil.deref(DBIDUtil.randomSample(distQ.getRelation().getDBIDs(), 1, random).iter()); means.add(first); ArrayDBIDs ids = DBIDUtil.ensureArray(distQ.getRelation().getDBIDs()); diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMediansLloyd.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMediansLloyd.java index cc7aaa9e..0a97c4d3 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMediansLloyd.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMediansLloyd.java @@ -31,6 +31,9 @@ import de.lmu.ifi.dbs.elki.data.Clustering; import de.lmu.ifi.dbs.elki.data.NumberVector; import de.lmu.ifi.dbs.elki.data.model.MeanModel; import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; +import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; +import de.lmu.ifi.dbs.elki.database.datastore.WritableIntegerDataStore; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.relation.Relation; @@ -88,15 +91,16 @@ public class KMediansLloyd<V extends NumberVector<?>, D extends Distance<D>> ext // Setup cluster assignment store List<ModifiableDBIDs> clusters = new ArrayList<>(); for (int i = 0; i < k; i++) { - clusters.add(DBIDUtil.newHashSet(relation.size() / k)); + clusters.add(DBIDUtil.newHashSet((int) (relation.size() * 2. / k))); } + WritableIntegerDataStore assignment = DataStoreUtil.makeIntegerStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, -1); IndefiniteProgress prog = LOG.isVerbose() ? 
new IndefiniteProgress("K-Medians iteration", LOG) : null; for (int iteration = 0; maxiter <= 0 || iteration < maxiter; iteration++) { if (prog != null) { prog.incrementProcessed(LOG); } - boolean changed = assignToNearestCluster(relation, medians, clusters); + boolean changed = assignToNearestCluster(relation, medians, clusters, assignment); // Stop if no cluster assignment changed. if (!changed) { break; diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsEM.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsEM.java index 87a0c7ae..41cca225 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsEM.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsEM.java @@ -48,8 +48,7 @@ import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance; import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress; import de.lmu.ifi.dbs.elki.math.Mean; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; @@ -119,7 +118,7 @@ public class KMedoidsEM<V, D extends NumberDistance<D, ?>> extends AbstractDista * @return result */ public Clustering<MedoidModel> run(Database database, Relation<V> relation) { - if (relation.size() <= 0) { + if(relation.size() <= 0) { return new Clustering<>("k-Medoids Clustering", "kmedoids-clustering"); } DistanceQuery<V, D> distQ = database.getDistanceQuery(relation, getDistanceFunction()); @@ -127,7 +126,7 @@ public class KMedoidsEM<V, D extends NumberDistance<D, ?>> extends AbstractDista 
ArrayModifiableDBIDs medoids = DBIDUtil.newArray(initializer.chooseInitialMedoids(k, distQ)); // Setup cluster assignment store List<ModifiableDBIDs> clusters = new ArrayList<>(); - for (int i = 0; i < k; i++) { + for(int i = 0; i < k; i++) { clusters.add(DBIDUtil.newHashSet(relation.size() / k)); } Mean[] mdists = Mean.newArray(k); @@ -139,47 +138,47 @@ public class KMedoidsEM<V, D extends NumberDistance<D, ?>> extends AbstractDista IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("K-Medoids iteration", LOG) : null; // Swap phase boolean changed = true; - while (changed) { - if (prog != null) { + while(changed) { + if(prog != null) { prog.incrementProcessed(LOG); } changed = false; // Try to swap the medoid with a better cluster member: int i = 0; - for (DBIDIter miter = medoids.iter(); miter.valid(); miter.advance(), i++) { + for(DBIDIter miter = medoids.iter(); miter.valid(); miter.advance(), i++) { DBID best = null; Mean bestm = mdists[i]; - for (DBIDIter iter = clusters.get(i).iter(); iter.valid(); iter.advance()) { - if (DBIDUtil.equal(miter, iter)) { + for(DBIDIter iter = clusters.get(i).iter(); iter.valid(); iter.advance()) { + if(DBIDUtil.equal(miter, iter)) { continue; } Mean mdist = new Mean(); - for (DBIDIter iter2 = clusters.get(i).iter(); iter2.valid(); iter2.advance()) { + for(DBIDIter iter2 = clusters.get(i).iter(); iter2.valid(); iter2.advance()) { mdist.put(distQ.distance(iter, iter2).doubleValue()); } - if (mdist.getMean() < bestm.getMean()) { + if(mdist.getMean() < bestm.getMean()) { best = DBIDUtil.deref(iter); bestm = mdist; } } - if (best != null && !DBIDUtil.equal(miter, best)) { + if(best != null && !DBIDUtil.equal(miter, best)) { changed = true; medoids.set(i, best); mdists[i] = bestm; } } // Reassign - if (changed) { + if(changed) { assignToNearestCluster(medoids, mdists, clusters, distQ); } } - if (prog != null) { + if(prog != null) { prog.setCompleted(LOG); } // Wrap result Clustering<MedoidModel> result = new 
Clustering<>("k-Medoids Clustering", "kmedoids-clustering"); - for (int i = 0; i < clusters.size(); i++) { + for(int i = 0; i < clusters.size(); i++) { MedoidModel model = new MedoidModel(medoids.get(i)); result.addToplevelCluster(new Cluster<>(clusters.get(i), model)); } @@ -200,27 +199,27 @@ public class KMedoidsEM<V, D extends NumberDistance<D, ?>> extends AbstractDista boolean changed = false; double[] dists = new double[k]; - for (DBIDIter iditer = distQ.getRelation().iterDBIDs(); iditer.valid(); iditer.advance()) { + for(DBIDIter iditer = distQ.getRelation().iterDBIDs(); iditer.valid(); iditer.advance()) { int minIndex = 0; double mindist = Double.POSITIVE_INFINITY; { int i = 0; - for (DBIDIter miter = means.iter(); miter.valid(); miter.advance(), i++) { + for(DBIDIter miter = means.iter(); miter.valid(); miter.advance(), i++) { dists[i] = distQ.distance(iditer, miter).doubleValue(); - if (dists[i] < mindist) { + if(dists[i] < mindist) { minIndex = i; mindist = dists[i]; } } } - if (clusters.get(minIndex).add(iditer)) { + if(clusters.get(minIndex).add(iditer)) { changed = true; mdist[minIndex].put(mindist); // Remove from previous cluster // TODO: keep a list of cluster assignments to save this search? 
- for (int i = 0; i < k; i++) { - if (i != minIndex) { - if (clusters.get(i).remove(iditer)) { + for(int i = 0; i < k; i++) { + if(i != minIndex) { + if(clusters.get(i).remove(iditer)) { mdist[minIndex].put(dists[i], -1); break; } @@ -259,19 +258,19 @@ public class KMedoidsEM<V, D extends NumberDistance<D, ?>> extends AbstractDista protected void makeOptions(Parameterization config) { super.makeOptions(config); IntParameter kP = new IntParameter(KMeans.K_ID); - kP.addConstraint(new GreaterConstraint(0)); - if (config.grab(kP)) { + kP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT); + if(config.grab(kP)) { k = kP.intValue(); } ObjectParameter<KMedoidsInitialization<V>> initialP = new ObjectParameter<>(KMeans.INIT_ID, KMedoidsInitialization.class, PAMInitialMeans.class); - if (config.grab(initialP)) { + if(config.grab(initialP)) { initializer = initialP.instantiateClass(config); } IntParameter maxiterP = new IntParameter(KMeans.MAXITER_ID, 0); - maxiterP.addConstraint(new GreaterEqualConstraint(0)); - if (config.grab(maxiterP)) { + maxiterP.addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_INT); + if(config.grab(maxiterP)) { maxiter = maxiterP.intValue(); } } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsPAM.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsPAM.java index 1feda867..c9e1dc47 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsPAM.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsPAM.java @@ -53,8 +53,7 @@ import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress; import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import de.lmu.ifi.dbs.elki.utilities.documentation.Title; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint; +import 
de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; @@ -124,7 +123,7 @@ public class KMedoidsPAM<V, D extends NumberDistance<D, ?>> extends AbstractDist * @return result */ public Clustering<MedoidModel> run(Database database, Relation<V> relation) { - if (relation.size() <= 0) { + if(relation.size() <= 0) { return new Clustering<>("k-Medoids Clustering", "kmedoids-clustering"); } DistanceQuery<V, D> distQ = database.getDistanceQuery(relation, getDistanceFunction()); @@ -133,7 +132,7 @@ public class KMedoidsPAM<V, D extends NumberDistance<D, ?>> extends AbstractDist ArrayModifiableDBIDs medoids = DBIDUtil.newArray(initializer.chooseInitialMedoids(k, distQ)); // Setup cluster assignment store List<ModifiableDBIDs> clusters = new ArrayList<>(); - for (int i = 0; i < k; i++) { + for(int i = 0; i < k; i++) { clusters.add(DBIDUtil.newHashSet(relation.size() / k)); } @@ -145,8 +144,8 @@ public class KMedoidsPAM<V, D extends NumberDistance<D, ?>> extends AbstractDist IndefiniteProgress prog = LOG.isVerbose() ? 
new IndefiniteProgress("PAM iteration", LOG) : null; // Swap phase boolean changed = true; - while (changed) { - if (prog != null) { + while(changed) { + if(prog != null) { prog.incrementProcessed(LOG); } changed = false; @@ -155,57 +154,60 @@ public class KMedoidsPAM<V, D extends NumberDistance<D, ?>> extends AbstractDist DBID bestid = null; int bestcluster = -1; int i = 0; - for (DBIDIter miter = medoids.iter(); miter.valid(); miter.advance(), i++) { - for (DBIDIter iter = clusters.get(i).iter(); iter.valid(); iter.advance()) { - if (DBIDUtil.equal(miter, iter)) { + for(DBIDIter miter = medoids.iter(); miter.valid(); miter.advance(), i++) { + for(DBIDIter iter = clusters.get(i).iter(); iter.valid(); iter.advance()) { + if(DBIDUtil.equal(miter, iter)) { continue; } // double disti = distQ.distance(id, med).doubleValue(); double cost = 0; DBIDIter olditer = medoids.iter(); - for (int j = 0; j < k; j++, olditer.advance()) { - for (DBIDIter iter2 = clusters.get(j).iter(); iter2.valid(); iter2.advance()) { + for(int j = 0; j < k; j++, olditer.advance()) { + for(DBIDIter iter2 = clusters.get(j).iter(); iter2.valid(); iter2.advance()) { double distcur = distQ.distance(iter2, olditer).doubleValue(); double distnew = distQ.distance(iter2, iter).doubleValue(); - if (j == i) { + if(j == i) { // Cases 1 and 2. double distsec = second.doubleValue(iter2); - if (distcur > distsec) { + if(distcur > distsec) { // Case 1, other would switch to a third medoid cost += distsec - distcur; // Always positive! 
- } else { // Would remain with the candidate + } + else { // Would remain with the candidate cost += distnew - distcur; // Could be negative } - } else { + } + else { // Cases 3-4: objects from other clusters - if (distcur < distnew) { + if(distcur < distnew) { // Case 3: no change - } else { + } + else { // Case 4: would switch to new medoid cost += distnew - distcur; // Always negative } } } } - if (cost < best) { + if(cost < best) { best = cost; bestid = DBIDUtil.deref(iter); bestcluster = i; } } } - if (prog != null) { + if(prog != null) { prog.setCompleted(LOG); } - if (LOG.isDebugging()) { + if(LOG.isDebugging()) { LOG.debug("Best cost: " + best); } - if (bestid != null) { + if(bestid != null) { changed = true; medoids.set(bestcluster, bestid); } // Reassign - if (changed) { + if(changed) { // TODO: can we save some of these recomputations? assignToNearestCluster(medoids, ids, second, clusters, distQ); } @@ -213,7 +215,7 @@ public class KMedoidsPAM<V, D extends NumberDistance<D, ?>> extends AbstractDist // Wrap result Clustering<MedoidModel> result = new Clustering<>("k-Medoids Clustering", "kmedoids-clustering"); - for (int i = 0; i < clusters.size(); i++) { + for(int i = 0; i < clusters.size(); i++) { MedoidModel model = new MedoidModel(medoids.get(i)); result.addToplevelCluster(new Cluster<>(clusters.get(i), model)); } @@ -234,30 +236,31 @@ public class KMedoidsPAM<V, D extends NumberDistance<D, ?>> extends AbstractDist protected boolean assignToNearestCluster(ArrayDBIDs means, DBIDs ids, WritableDoubleDataStore second, List<? 
extends ModifiableDBIDs> clusters, DistanceQuery<V, D> distQ) { boolean changed = false; - for (DBIDIter iditer = distQ.getRelation().iterDBIDs(); iditer.valid(); iditer.advance()) { + for(DBIDIter iditer = distQ.getRelation().iterDBIDs(); iditer.valid(); iditer.advance()) { int minIndex = 0; double mindist = Double.POSITIVE_INFINITY; double mindist2 = Double.POSITIVE_INFINITY; { int i = 0; - for (DBIDIter miter = means.iter(); miter.valid(); miter.advance(), i++) { + for(DBIDIter miter = means.iter(); miter.valid(); miter.advance(), i++) { double dist = distQ.distance(iditer, miter).doubleValue(); - if (dist < mindist) { + if(dist < mindist) { minIndex = i; mindist2 = mindist; mindist = dist; - } else if (dist < mindist2) { + } + else if(dist < mindist2) { mindist2 = dist; } } } - if (clusters.get(minIndex).add(iditer)) { + if(clusters.get(minIndex).add(iditer)) { changed = true; // Remove from previous cluster // TODO: keep a list of cluster assignments to save this search? - for (int i = 0; i < k; i++) { - if (i != minIndex) { - if (clusters.get(i).remove(iditer)) { + for(int i = 0; i < k; i++) { + if(i != minIndex) { + if(clusters.get(i).remove(iditer)) { break; } } @@ -296,19 +299,19 @@ public class KMedoidsPAM<V, D extends NumberDistance<D, ?>> extends AbstractDist protected void makeOptions(Parameterization config) { super.makeOptions(config); IntParameter kP = new IntParameter(KMeans.K_ID); - kP.addConstraint(new GreaterConstraint(0)); - if (config.grab(kP)) { + kP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT); + if(config.grab(kP)) { k = kP.intValue(); } ObjectParameter<KMedoidsInitialization<V>> initialP = new ObjectParameter<>(KMeans.INIT_ID, KMedoidsInitialization.class, PAMInitialMeans.class); - if (config.grab(initialP)) { + if(config.grab(initialP)) { initializer = initialP.instantiateClass(config); } IntParameter maxiterP = new IntParameter(KMeans.MAXITER_ID, 0); - maxiterP.addConstraint(new GreaterEqualConstraint(0)); - if 
(config.grab(maxiterP)) { + maxiterP.addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_INT); + if(config.grab(maxiterP)) { maxiter = maxiterP.intValue(); } } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyGeneratedInitialMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyGeneratedInitialMeans.java index ee90e0dc..1329132e 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyGeneratedInitialMeans.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyGeneratedInitialMeans.java @@ -60,7 +60,7 @@ public class RandomlyGeneratedInitialMeans<V extends NumberVector<?>> extends Ab NumberVector.Factory<V, ?> factory = RelationUtil.getNumberVectorFactory(relation); Pair<V, V> minmax = DatabaseUtil.computeMinMax(relation); List<V> means = new ArrayList<>(k); - final Random random = rnd.getRandom(); + final Random random = rnd.getSingleThreadedRandom(); for(int i = 0; i < k; i++) { double[] r = MathUtil.randomDoubleArray(dim, random); // Rescale diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/SampleKMeansInitialization.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/SampleKMeansInitialization.java index 9f0a1923..79013364 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/SampleKMeansInitialization.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/SampleKMeansInitialization.java @@ -93,7 +93,7 @@ public class SampleKMeansInitialization<V extends NumberVector<?>, D extends Dis Clustering<? extends MeanModel<V>> clusters = innerkMeans.run(proxydb, proxyv); List<V> means = new ArrayList<>(); for (Cluster<? 
extends MeanModel<V>> cluster : clusters.getAllClusters()) { - means.add((V) cluster.getModel().getMean()); + means.add(cluster.getModel().getMean()); } return means; diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/package-info.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/package-info.java index ed9a528d..1be19bd1 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/package-info.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/package-info.java @@ -1,4 +1,27 @@ /** * Quality measures for k-Means results. */ + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2013 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.quality;
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/onedimensional/KNNKernelDensityMinimaClustering.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/onedimensional/KNNKernelDensityMinimaClustering.java new file mode 100644 index 00000000..55114f7d --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/onedimensional/KNNKernelDensityMinimaClustering.java @@ -0,0 +1,384 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.onedimensional; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2013 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm; +import de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm; +import de.lmu.ifi.dbs.elki.data.Cluster; +import de.lmu.ifi.dbs.elki.data.Clustering; +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.data.VectorUtil; +import de.lmu.ifi.dbs.elki.data.model.ClusterModel; +import de.lmu.ifi.dbs.elki.data.type.TypeInformation; +import de.lmu.ifi.dbs.elki.data.type.TypeUtil; +import de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation; +import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; +import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; +import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore; +import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs; +import de.lmu.ifi.dbs.elki.database.ids.DBIDArrayIter; +import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.logging.progress.StepProgress; +import de.lmu.ifi.dbs.elki.math.statistics.kernelfunctions.EpanechnikovKernelDensityFunction; +import de.lmu.ifi.dbs.elki.math.statistics.kernelfunctions.KernelDensityFunction; +import de.lmu.ifi.dbs.elki.utilities.datastructures.QuickSelect; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.EnumParameter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; + +/** + * Cluster one-dimensional data by splitting the data set on local minima after + * performing kernel density estimation. 
+ * + * @author Erich Schubert + */ +public class KNNKernelDensityMinimaClustering<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering<ClusterModel>> implements ClusteringAlgorithm<Clustering<ClusterModel>> { + /** + * Class logger. + */ + private static final Logging LOG = Logging.getLogger(KNNKernelDensityMinimaClustering.class); + + /** + * Estimation mode. + * + * @apiviz.exclude + */ + public static enum Mode { + BALLOON, // Balloon estimator + SAMPLE, // Sample-point estimator + } + + /** + * Dimension to use for clustering. + */ + protected int dim; + + /** + * Kernel density function. + */ + protected KernelDensityFunction kernel; + + /** + * Estimation modes. + */ + protected Mode mode; + + /** + * Number of neighbors to use for bandwidth. + */ + protected int k; + + /** + * Window width, for local minima criterions. + */ + protected int minwindow; + + /** + * Constructor. + * + * @param dim Dimension to use for clustering + * @param kernel Kernel function + * @param mode Bandwidth mode + * @param k Number of neighbors + * @param minwindow Window size for comparison + */ + public KNNKernelDensityMinimaClustering(int dim, KernelDensityFunction kernel, Mode mode, int k, int minwindow) { + super(); + this.dim = dim; + this.kernel = kernel; + this.mode = mode; + this.k = k; + this.minwindow = minwindow; + } + + /** + * Run the clustering algorithm on a data relation. + * + * @param relation Relation + * @return Clustering result + */ + public Clustering<ClusterModel> run(Relation<V> relation) { + ArrayModifiableDBIDs ids = DBIDUtil.newArray(relation.getDBIDs()); + final int size = ids.size(); + + // Sort by the sole dimension + ids.sort(new VectorUtil.SortDBIDsBySingleDimension(relation, dim)); + + // Density storage. + WritableDoubleDataStore density = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP, 0.); + + DBIDArrayIter iter = ids.iter(), iter2 = ids.iter(); + + StepProgress sprog = LOG.isVerbose() ? 
new StepProgress("Clustering steps", 2) : null; + + if(sprog != null) { + sprog.beginStep(1, "Kernel density estimation.", LOG); + } + { + double[] scratch = new double[2 * k]; + iter.seek(0); + for(int i = 0; i < size; i++, iter.advance()) { + // Current value. + final double curv = relation.get(iter).doubleValue(dim); + + final int pre = Math.max(i - k, 0), prek = i - pre; + final int pos = Math.min(i + k, size - 1), posk = pos - i; + iter2.seek(pre); + for(int j = 0; j < prek; j++, iter2.advance()) { + scratch[j] = curv - relation.get(iter2).doubleValue(dim); + } + assert (iter2.getOffset() == i); + iter2.advance(); + for(int j = 0; j < posk; j++, iter2.advance()) { + scratch[prek + j] = relation.get(iter2).doubleValue(dim) - curv; + } + + assert (prek + posk >= k); + double kdist = QuickSelect.quickSelect(scratch, 0, prek + posk, k); + switch(mode){ + case BALLOON: { + double dens = 0.; + if(kdist > 0.) { + for(int j = 0; j < prek + posk; j++) { + dens += kernel.density(scratch[j] / kdist); + } + } + else { + dens = Double.POSITIVE_INFINITY; + } + assert (iter.getOffset() == i); + density.putDouble(iter, dens); + break; + } + case SAMPLE: { + if(kdist > 0.) 
{ + iter2.seek(pre); + for(int j = 0; j < prek; j++, iter2.advance()) { + double delta = curv - relation.get(iter2).doubleValue(dim); + density.putDouble(iter2, density.doubleValue(iter2) + kernel.density(delta / kdist)); + } + assert (iter2.getOffset() == i); + iter2.advance(); + for(int j = 0; j < posk; j++, iter2.advance()) { + double delta = relation.get(iter2).doubleValue(dim) - curv; + density.putDouble(iter2, density.doubleValue(iter2) + kernel.density(delta / kdist)); + } + } + else { + iter2.seek(pre); + for(int j = 0; j < prek; j++, iter2.advance()) { + double delta = curv - relation.get(iter2).doubleValue(dim); + if(!(delta > 0.)) { + density.putDouble(iter2, Double.POSITIVE_INFINITY); + } + } + assert (iter2.getOffset() == i); + iter2.advance(); + for(int j = 0; j < posk; j++, iter2.advance()) { + double delta = relation.get(iter2).doubleValue(dim) - curv; + if(!(delta > 0.)) { + density.putDouble(iter2, Double.POSITIVE_INFINITY); + } + } + } + break; + } + default: + throw new UnsupportedOperationException("Unknown mode specified."); + } + } + } + + if(sprog != null) { + sprog.beginStep(2, "Local minima detection.", LOG); + } + Clustering<ClusterModel> clustering = new Clustering<>("onedimensional-kde-clustering", "One-Dimensional clustering using kernel density estimation."); + { + double[] scratch = new double[2 * minwindow + 1]; + int begin = 0; + int halfw = (minwindow + 1) >> 1; + iter.seek(0); + // Fill initial buffer. 
+ for(int i = 0; i < size; i++, iter.advance()) { + final int m = i % scratch.length, t = (i - minwindow - 1) % scratch.length; + scratch[m] = density.doubleValue(iter); + if(i > scratch.length) { + double min = Double.POSITIVE_INFINITY; + for(int j = 0; j < scratch.length; j++) { + if(j != t && scratch[j] < min) { + min = scratch[j]; + } + } + // Local minimum: + if(scratch[t] < min) { + int end = i - minwindow + 1; + { // Test on which side the kNN is + iter2.seek(end); + double curv = relation.get(iter2).doubleValue(dim); + iter2.seek(end - halfw); + double left = relation.get(iter2).doubleValue(dim) - curv; + iter2.seek(end + halfw); + double right = curv - relation.get(iter2).doubleValue(dim); + if(left < right) { + end++; + } + } + iter2.seek(begin); + ArrayModifiableDBIDs cids = DBIDUtil.newArray(end - begin); + for(int j = 0; j < end - begin; j++, iter2.advance()) { + cids.add(iter2); + } + clustering.addToplevelCluster(new Cluster<>(cids, ClusterModel.CLUSTER)); + begin = end; + } + } + } + // Extract last cluster + int end = size; + iter2.seek(begin); + ArrayModifiableDBIDs cids = DBIDUtil.newArray(end - begin); + for(int j = 0; j < end - begin; j++, iter2.advance()) { + cids.add(iter2); + } + clustering.addToplevelCluster(new Cluster<>(cids, ClusterModel.CLUSTER)); + } + + if(sprog != null) { + sprog.setCompleted(LOG); + } + return clustering; + } + + @Override + public TypeInformation[] getInputTypeRestriction() { + return TypeUtil.array(new VectorFieldTypeInformation<>(NumberVector.class, dim + 1, Integer.MAX_VALUE)); + } + + @Override + protected Logging getLogger() { + return LOG; + } + + /** + * Parameterization class. + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer<V extends NumberVector<?>> extends AbstractParameterizer { + /** + * Dimension to use for clustering. + */ + public static final OptionID DIM_ID = new OptionID("kernelcluster.dim", "Dimension to use for clustering. 
For one-dimensional data, use 0."); + + /** + * Kernel function. + */ + public static final OptionID KERNEL_ID = new OptionID("kernelcluster.kernel", "Kernel function for density estimation."); + + /** + * KDE mode. + */ + public static final OptionID MODE_ID = new OptionID("kernelcluster.mode", "Kernel density estimation mode (baloon estimator vs. sample point estimator)."); + + /** + * Number of neighbors for bandwidth estimation. + */ + public static final OptionID K_ID = new OptionID("kernelcluster.knn", "Number of nearest neighbors to use for bandwidth estimation."); + + /** + * Half window width to find local minima. + */ + public static final OptionID WINDOW_ID = new OptionID("kernelcluster.window", "Half width of sliding window to find local minima."); + + /** + * Dimension to use for clustering. + */ + protected int dim; + + /** + * Kernel density function. + */ + protected KernelDensityFunction kernel; + + /** + * Estimation modes. + */ + protected Mode mode; + + /** + * Number of neighbors to use for bandwidth. + */ + protected int k; + + /** + * Window width, for local minima criterions. 
+ */ + protected int minwindow; + + @Override + protected void makeOptions(Parameterization config) { + super.makeOptions(config); + IntParameter dimP = new IntParameter(DIM_ID, 0); + dimP.addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_INT); + if(config.grab(dimP)) { + dim = dimP.intValue(); + } + + ObjectParameter<KernelDensityFunction> kernelP = new ObjectParameter<>(KERNEL_ID, KernelDensityFunction.class, EpanechnikovKernelDensityFunction.class); + if(config.grab(kernelP)) { + kernel = kernelP.instantiateClass(config); + } + + EnumParameter<Mode> modeP = new EnumParameter<>(MODE_ID, Mode.class, Mode.BALLOON); + if(config.grab(modeP)) { + mode = modeP.getValue(); + } + + IntParameter kP = new IntParameter(K_ID); + kP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT); + if(config.grab(kP)) { + k = kP.intValue(); + } + + IntParameter windowP = new IntParameter(WINDOW_ID); + windowP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT); + if(config.grab(windowP)) { + minwindow = windowP.intValue(); + } + } + + @Override + protected KNNKernelDensityMinimaClustering<V> makeInstance() { + return new KNNKernelDensityMinimaClustering<>(dim, kernel, mode, k, minwindow); + } + } +} diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/onedimensional/package-info.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/onedimensional/package-info.java new file mode 100644 index 00000000..c6c55244 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/onedimensional/package-info.java @@ -0,0 +1,27 @@ +/** + * Clustering algorithms for one-dimensional data. 
+ */ + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2013 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ +package de.lmu.ifi.dbs.elki.algorithm.clustering.onedimensional;
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/CLIQUE.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/CLIQUE.java index db026e93..617d74cd 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/CLIQUE.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/CLIQUE.java @@ -56,8 +56,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import de.lmu.ifi.dbs.elki.utilities.documentation.Title; import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.LessConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.Flag; @@ -594,14 +593,14 @@ public class CLIQUE<V extends NumberVector<?>> extends AbstractAlgorithm<Cluster protected void makeOptions(Parameterization config) { super.makeOptions(config); IntParameter xsiP = new IntParameter(XSI_ID); - xsiP.addConstraint(new GreaterConstraint(0)); + xsiP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT); if(config.grab(xsiP)) { xsi = xsiP.intValue(); } DoubleParameter tauP = new DoubleParameter(TAU_ID); - tauP.addConstraint(new GreaterConstraint(0)); - tauP.addConstraint(new LessConstraint(1)); + tauP.addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE); + tauP.addConstraint(CommonConstraints.LESS_THAN_ONE_DOUBLE); if(config.grab(tauP)) { tau = tauP.doubleValue(); } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/DOC.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/DOC.java new file mode 100644 index 
00000000..5f798a66 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/DOC.java @@ -0,0 +1,605 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.subspace;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures +
+ Copyright (C) 2013
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team +
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version. +
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details. +
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import java.util.BitSet;
+import java.util.Random;
+
+import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm;
+import de.lmu.ifi.dbs.elki.data.Cluster;
+import de.lmu.ifi.dbs.elki.data.Clustering;
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.data.Subspace;
+import de.lmu.ifi.dbs.elki.data.model.SubspaceModel;
+import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
+import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
+import de.lmu.ifi.dbs.elki.database.Database;
+import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDArrayIter;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDVar;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
+import de.lmu.ifi.dbs.elki.database.query.DatabaseQuery;
+import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery;
+import de.lmu.ifi.dbs.elki.database.query.range.RangeQuery;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.database.relation.RelationUtil;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.subspace.SubspaceMaximumDistanceFunction;
+import de.lmu.ifi.dbs.elki.distance.distancevalue.DoubleDistance;
+import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress;
+import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress;
+import de.lmu.ifi.dbs.elki.math.linearalgebra.Centroid;
+import de.lmu.ifi.dbs.elki.utilities.RandomFactory;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.Flag;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.RandomParameter;
+
+/**
+ * <p>
+ * Provides the DOC algorithm, and its heuristic variant, FastDOC. DOC is a
+ * sampling based subspace clustering algorithm.
+ * </p>
+ *
+ * <p>
+ * Reference: <br/>
+ * C. M. Procopiuc, M. Jones, P. K. Agarwal, T. M. Murali<br />
+ * A Monte Carlo algorithm for fast projective clustering. <br/>
+ * In: Proc. ACM SIGMOD Int. Conf. on Management of Data (SIGMOD '02).
+ * </p>
+ *
+ * @author Florian Nuecke
+ *
+ * @apiviz.has SubspaceModel
+ *
+ * @param <V> the type of NumberVector handled by this Algorithm.
+ */
+@Title("DOC: Density-based Optimal projective Clustering")
+@Reference(authors = "C. M. Procopiuc, M. Jones, P. K. Agarwal, T. M. Murali", title = "A Monte Carlo algorithm for fast projective clustering", booktitle = "Proc. ACM SIGMOD Int. Conf. on Management of Data (SIGMOD '02)", url = "http://dx.doi.org/10.1145/564691.564739")
+public class DOC<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering<SubspaceModel<V>>> implements SubspaceClusteringAlgorithm<SubspaceModel<V>> {
+ /**
+ * The logger for this class.
+ */
+ private static final Logging LOG = Logging.getLogger(DOC.class);
+
+ /**
+ * Relative density threshold parameter alpha.
+ */
+ private double alpha;
+
+ /**
+ * Balancing parameter for importance of points vs. dimensions
+ */
+ private double beta;
+
+ /**
+ * Half width parameter.
+ */
+ private double w;
+
+ /**
+ * Holds the value of {@link Parameterizer#HEURISTICS_ID}.
+ */
+ private boolean heuristics;
+
+ /**
+ * Holds the value of {@link Parameterizer#D_ZERO_ID}.
+ */
+ private int d_zero;
+
+ /**
+ * Randomizer used internally for sampling points.
+ */
+ private RandomFactory rnd;
+
+ /**
+ * Constructor.
+ *
+ * @param alpha α relative density threshold.
+ * @param beta β balancing parameter for size vs. dimensionality.
+ * @param w <em>w</em> half width parameter.
+ * @param heuristics whether to use heuristics (FastDOC) or not.
+ * @param random Random factory
+ */
+ public DOC(double alpha, double beta, double w, boolean heuristics, int d_zero, RandomFactory random) {
+ this.alpha = alpha;
+ this.beta = beta;
+ this.w = w;
+ this.heuristics = heuristics;
+ this.d_zero = d_zero;
+ this.rnd = random;
+ }
+
+ /**
+ * Performs the DOC or FastDOC (as configured) algorithm on the given
+ * Database.
+ *
+ * <p>
+ * This will run exhaustively, i.e. run DOC until no clusters are found
+ * anymore / the database size has shrunk below the threshold for minimum
+ * cluster size.
+ * </p>
+ *
+ * @param database Database
+ * @param relation Data relation
+ */
+ public Clustering<SubspaceModel<V>> run(Database database, Relation<V> relation) {
+ // Dimensionality of our set.
+ final int d = RelationUtil.dimensionality(relation);
+
+ // Get available DBIDs as a set we can remove items from.
+ ArrayModifiableDBIDs S = DBIDUtil.newArray(relation.getDBIDs());
+
+ // Precompute values as described in Figure 2.
+ double r = Math.abs(Math.log(d + d) / Math.log(beta * .5));
+ // Outer loop count.
+ int n = (int) (2. / alpha);
+ // Inner loop count.
+ int m = (int) (Math.pow(2. / alpha, r) * Math.log(4));
+ if(heuristics) {
+ m = Math.min(m, Math.min(1000000, d * d));
+ }
+
+ // Minimum size for a cluster for it to be accepted.
+ int minClusterSize = (int) (alpha * S.size());
+
+ // List of all clusters we found.
+ Clustering<SubspaceModel<V>> result = new Clustering<>("DOC Clusters", "DOC");
+
+ // Inform the user about the number of actual clusters found so far.
+ IndefiniteProgress cprogress = LOG.isVerbose() ? new IndefiniteProgress("Number of clusters", LOG) : null;
+
+ // To not only find a single cluster, we continue running until our set
+ // of points is empty.
+ while(S.size() > minClusterSize) {
+ Cluster<SubspaceModel<V>> C;
+ if(heuristics) {
+ C = runFastDOC(relation, S, d, n, m, (int) r);
+ }
+ else {
+ C = runDOC(relation, S, d, n, m, (int) r, minClusterSize);
+ }
+
+ if(C == null) {
+ // Stop trying if we couldn't find a cluster.
+ break;
+ }
+ // Found a cluster, remember it, remove its points from the set.
+ result.addToplevelCluster(C);
+
+ // Remove all points of the cluster from the set and continue.
+ S.removeDBIDs(C.getIDs());
+
+ if(cprogress != null) {
+ cprogress.setProcessed(result.getAllClusters().size(), LOG);
+ }
+ }
+
+ // Add the remainder as noise.
+ if(S.size() > 0) {
+ BitSet alldims = new BitSet();
+ alldims.set(0, d);
+ result.addToplevelCluster(new Cluster<>(S, true, new SubspaceModel<>(new Subspace(alldims), Centroid.make(relation, S).toVector(relation))));
+ }
+
+ if(cprogress != null) {
+ cprogress.setCompleted(LOG);
+ }
+
+ return result;
+ }
+
+ /**
+ * Performs a single run of DOC, finding a single cluster.
+ *
+ * @param relation used to get actual values for DBIDs.
+ * @param S The set of points we're working on.
+ * @param d Dimensionality of the data set we're currently working on.
+ * @param n Number of outer iterations (seed points).
+ * @param m Number of inner iterations (per seed point).
+ * @param r Size of random samples.
+ * @param minClusterSize Minimum size a cluster must have to be accepted.
+ * @return a cluster, if one is found, else <code>null</code>.
+ */
+ private Cluster<SubspaceModel<V>> runDOC(Relation<V> relation, ArrayModifiableDBIDs S, final int d, int n, int m, int r, int minClusterSize) {
+ final DoubleDistance wd = new DoubleDistance(w);
+ // Best cluster for the current run.
+ DBIDs C = null;
+ // Relevant attributes for the best cluster.
+ BitSet D = null;
+ // Quality of the best cluster.
+ double quality = Double.NEGATIVE_INFINITY;
+
+ // Bounds for our cluster.
+ // ModifiableHyperBoundingBox bounds = new ModifiableHyperBoundingBox(new
+ // double[d], new double[d]);
+
+ // Weights for distance (= rectangle query)
+ SubspaceMaximumDistanceFunction df = new SubspaceMaximumDistanceFunction(new BitSet(d));
+ DistanceQuery<V, DoubleDistance> dq = relation.getDatabase().getDistanceQuery(relation, df);
+ RangeQuery<V, DoubleDistance> rq = relation.getDatabase().getRangeQuery(dq);
+
+ // Inform the user about the progress in the current iteration.
+ FiniteProgress iprogress = LOG.isVerbose() ? new FiniteProgress("Iteration progress for current cluster", m * n, LOG) : null;
+
+ Random random = rnd.getSingleThreadedRandom();
+ DBIDArrayIter iter = S.iter();
+
+ for(int i = 0; i < n; ++i) {
+ // Pick a random seed point.
+ iter.seek(random.nextInt(S.size()));
+
+ for(int j = 0; j < m; ++j) {
+ // Choose a set of random points.
+ DBIDs randomSet = DBIDUtil.randomSample(S, Math.min(S.size(), r), random);
+
+ // Initialize cluster info.
+ BitSet nD = new BitSet(d);
+
+ // Test each dimension and build bounding box.
+ for(int k = 0; k < d; ++k) {
+ if(dimensionIsRelevant(k, relation, randomSet)) {
+ nD.set(k);
+ }
+ }
+ if(nD.cardinality() > 0) {
+ // Get all points in the box.
+ df.setSelectedDimensions(nD);
+ // TODO: add filtering capabilities into query API!
+ DBIDs nC = DBIDUtil.intersection(S, rq.getRangeForDBID(iter, wd));
+
+ if(LOG.isDebuggingFiner()) {
+ LOG.finer("Testing a cluster candidate, |C| = " + nC.size() + ", |D| = " + nD.cardinality());
+ }
+
+ // Is the cluster large enough?
+ if(nC.size() < minClusterSize) {
+ // Too small.
+ if(LOG.isDebuggingFiner()) {
+ LOG.finer("... but it's too small.");
+ }
+ }
+ else {
+ // Better cluster than before?
+ double nQuality = computeClusterQuality(nC.size(), nD.cardinality());
+ if(nQuality > quality) {
+ if(LOG.isDebuggingFiner()) {
+ LOG.finer("... and it's the best so far: " + nQuality + " vs. " + quality);
+ }
+ C = nC;
+ D = nD;
+ quality = nQuality;
+ }
+ else {
+ if(LOG.isDebuggingFiner()) {
+ LOG.finer("... but we already have a better one.");
+ }
+ }
+ }
+ }
+
+ if(iprogress != null) {
+ iprogress.incrementProcessed(LOG);
+ }
+ }
+ }
+
+ if(iprogress != null) {
+ iprogress.ensureCompleted(LOG);
+ }
+
+ if(C != null) {
+ return makeCluster(relation, C, D);
+ }
+ else {
+ return null;
+ }
+ }
+
+ /**
+ * Performs a single run of FastDOC, finding a single cluster.
+ *
+ * @param relation used to get actual values for DBIDs.
+ * @param S The set of points we're working on.
+ * @param d Dimensionality of the data set we're currently working on.
+ * @param n Number of outer iterations (seed points).
+ * @param m Number of inner iterations (per seed point).
+ * @param r Size of random samples.
+ * @return a cluster, if one is found, else <code>null</code>.
+ */
+ private Cluster<SubspaceModel<V>> runFastDOC(Relation<V> relation, ArrayModifiableDBIDs S, int d, int n, int m, int r) {
+ // Relevant attributes of highest cardinality.
+ BitSet D = null;
+ // The seed point for the best dimensions.
+ DBIDVar dV = DBIDUtil.newVar();
+
+ // Inform the user about the progress in the current iteration.
+ FiniteProgress iprogress = LOG.isVerbose() ? new FiniteProgress("Iteration progress for current cluster", m * n, LOG) : null;
+
+ Random random = rnd.getSingleThreadedRandom();
+
+ DBIDArrayIter iter = S.iter();
+ outer: for(int i = 0; i < n; ++i) {
+ // Pick a random seed point.
+ iter.seek(random.nextInt(S.size()));
+
+ for(int j = 0; j < m; ++j) {
+ // Choose a set of random points.
+ DBIDs randomSet = DBIDUtil.randomSample(S, Math.min(S.size(), r), random);
+
+ // Initialize cluster info.
+ BitSet nD = new BitSet(d);
+
+ // Test each dimension.
+ for(int k = 0; k < d; ++k) {
+ if(dimensionIsRelevant(k, relation, randomSet)) {
+ nD.set(k);
+ }
+ }
+
+ if(D == null || nD.cardinality() > D.cardinality()) {
+ D = nD;
+ dV.set(iter);
+
+ if(D.cardinality() >= d_zero) {
+ if(iprogress != null) {
+ iprogress.setProcessed(iprogress.getTotal(), LOG);
+ }
+ break outer;
+ }
+ }
+
+ if(iprogress != null) {
+ iprogress.incrementProcessed(LOG);
+ }
+ }
+ }
+
+ if(iprogress != null) {
+ iprogress.ensureCompleted(LOG);
+ }
+
+ // If no relevant dimensions were found, skip it.
+ if(D == null || D.cardinality() == 0) {
+ return null;
+ }
+
+ // Get all points in the box.
+ SubspaceMaximumDistanceFunction df = new SubspaceMaximumDistanceFunction(D);
+ DistanceQuery<V, DoubleDistance> dq = relation.getDatabase().getDistanceQuery(relation, df);
+ RangeQuery<V, DoubleDistance> rq = relation.getDatabase().getRangeQuery(dq, DatabaseQuery.HINT_SINGLE);
+
+ // TODO: add filtering capabilities into query API!
+ DBIDs C = DBIDUtil.intersection(S, rq.getRangeForDBID(dV, new DoubleDistance(w)));
+
+ // If we have a non-empty cluster, return it.
+ if(C.size() > 0) {
+ return makeCluster(relation, C, D);
+ }
+ else {
+ return null;
+ }
+ }
+
+ /**
+ * Utility method to test if a given dimension is relevant as determined via a
+ * set of reference points (i.e. if the spread of values along the attribute,
+ * max - min, does not exceed the half width w).
+ *
+ * @param dimension the dimension to test.
+ * @param relation used to get actual values for DBIDs.
+ * @param points the points to test.
+ * @return <code>true</code> if the dimension is relevant.
+ */
+ private boolean dimensionIsRelevant(int dimension, Relation<V> relation, DBIDs points) {
+ double min = Double.POSITIVE_INFINITY;
+ double max = Double.NEGATIVE_INFINITY;
+ for(DBIDIter iter = points.iter(); iter.valid(); iter.advance()) {
+ V xV = relation.get(iter);
+ min = Math.min(min, xV.doubleValue(dimension));
+ max = Math.max(max, xV.doubleValue(dimension));
+ if(max - min > w) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ /**
+ * Utility method to create a subspace cluster from a list of DBIDs and the
+ * relevant attributes.
+ *
+ * @param relation to compute a centroid.
+ * @param C the cluster points.
+ * @param D the relevant dimensions.
+ * @return an object representing the subspace cluster.
+ */
+ private Cluster<SubspaceModel<V>> makeCluster(Relation<V> relation, DBIDs C, BitSet D) {
+ DBIDs ids = DBIDUtil.newHashSet(C); // copy, also to lose distance values!
+ Cluster<SubspaceModel<V>> cluster = new Cluster<>(ids);
+ cluster.setModel(new SubspaceModel<>(new Subspace(D), Centroid.make(relation, ids).toVector(relation)));
+ return cluster;
+ }
+
+ /**
+ * Computes the quality of a cluster based on its size and number of relevant
+ * attributes, as described via the μ-function from the paper.
+ *
+ * @param clusterSize the size of the cluster.
+ * @param numRelevantDimensions the number of dimensions relevant to the
+ * cluster.
+ * @return a quality measure (only use this to compare the quality to that
+ * of other clusters).
+ */
+ private double computeClusterQuality(int clusterSize, int numRelevantDimensions) {
+ return clusterSize * Math.pow(1. / beta, numRelevantDimensions);
+ }
+
+ // ---------------------------------------------------------------------- //
+
+ @Override
+ public TypeInformation[] getInputTypeRestriction() {
+ return TypeUtil.array(TypeUtil.NUMBER_VECTOR_FIELD);
+ }
+
+ @Override
+ protected Logging getLogger() {
+ return LOG;
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Florian Nuecke
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer<V extends NumberVector<?>> extends AbstractParameterizer {
+ /**
+ * Relative density threshold parameter Alpha.
+ */
+ public static final OptionID ALPHA_ID = new OptionID("doc.alpha", "Minimum relative density for a set of points to be considered a cluster (|C|>=doc.alpha*|S|).");
+
+ /**
+ * Balancing parameter for importance of points vs. dimensions
+ */
+ public static final OptionID BETA_ID = new OptionID("doc.beta", "Preference of cluster size versus number of relevant dimensions (higher value means higher priority on larger clusters).");
+
+ /**
+ * Half width parameter.
+ */
+ public static final OptionID W_ID = new OptionID("doc.w", "Maximum extent of scattering of points along a single attribute for the attribute to be considered relevant.");
+
+ /**
+ * Parameter to enable FastDOC heuristics.
+ */
+ public static final OptionID HEURISTICS_ID = new OptionID("doc.fastdoc", "Use heuristics as described, thus using the FastDOC algorithm (not yet implemented).");
+
+ /**
+ * Stopping threshold for FastDOC.
+ */
+ public static final OptionID D_ZERO_ID = new OptionID("doc.d0", "Parameter for FastDOC, setting the number of relevant attributes which, when found for a cluster, are deemed enough to stop iterating.");
+
+ /**
+ * Random seeding parameter.
+ */
+ public static final OptionID RANDOM_ID = new OptionID("doc.random-seed", "Random seed, for reproducible experiments.");
+
+ /**
+ * Relative density threshold parameter Alpha.
+ */
+ protected double alpha;
+
+ /**
+ * Balancing parameter for importance of points vs. dimensions
+ */
+ protected double beta;
+
+ /**
+ * Half width parameter.
+ */
+ protected double w;
+
+ /**
+ * Parameter to enable FastDOC heuristics.
+ */
+ protected boolean heuristics;
+
+ /**
+ * Stopping threshold for FastDOC.
+ */
+ protected int d_zero;
+
+ /**
+ * Random seeding factory.
+ */
+ protected RandomFactory random = RandomFactory.DEFAULT;
+
+ @Override
+ protected void makeOptions(Parameterization config) {
+ super.makeOptions(config);
+
+ {
+ DoubleParameter param = new DoubleParameter(ALPHA_ID, 0.2);
+ param.addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_DOUBLE);
+ param.addConstraint(CommonConstraints.LESS_EQUAL_ONE_DOUBLE);
+ if(config.grab(param)) {
+ alpha = param.getValue();
+ }
+ }
+
+ {
+ DoubleParameter param = new DoubleParameter(BETA_ID, 0.8);
+ param.addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE);
+ param.addConstraint(CommonConstraints.LESS_THAN_ONE_DOUBLE);
+ if(config.grab(param)) {
+ beta = param.getValue();
+ }
+ }
+
+ {
+ DoubleParameter param = new DoubleParameter(W_ID, 0.05);
+ param.addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_DOUBLE);
+ if(config.grab(param)) {
+ w = param.getValue();
+ }
+ }
+
+ {
+ Flag param = new Flag(HEURISTICS_ID);
+ if(config.grab(param)) {
+ heuristics = param.getValue();
+ }
+ }
+
+ if(heuristics) {
+ IntParameter param = new IntParameter(D_ZERO_ID, 5);
+ param.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
+ if(config.grab(param)) {
+ d_zero = param.getValue();
+ }
+ }
+
+ {
+ RandomParameter param = new RandomParameter(RANDOM_ID);
+ if(config.grab(param)) {
+ random = param.getValue();
+ }
+ }
+ }
+
+ @Override
+ protected DOC<V> makeInstance() {
+ return new DOC<>(alpha, beta, w, heuristics, d_zero, random);
+ }
+ }
+}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/DiSH.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/DiSH.java index b17ebebb..cd5e51b8 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/DiSH.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/DiSH.java @@ -69,8 +69,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import de.lmu.ifi.dbs.elki.utilities.documentation.Title; import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ChainedParameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ListParameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; @@ -170,12 +169,12 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin */ public Clustering<SubspaceModel<V>> run(Database database, Relation<V> relation) { // Instantiate DiSH distance (and thus run the preprocessor) - if (LOG.isVerbose()) { + if(LOG.isVerbose()) { LOG.verbose("*** Run DiSH preprocessor."); } DiSHDistanceFunction.Instance<V> dishDistanceQuery = dishDistance.instantiate(relation); // Configure and run OPTICS. 
- if (LOG.isVerbose()) { + if(LOG.isVerbose()) { LOG.verbose("*** Run OPTICS algorithm."); } ListParameterization opticsconfig = new ListParameterization(opticsAlgorithmParameters); @@ -186,7 +185,7 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin optics = opticsconfig.tryInstantiate(cls); ClusterOrderResult<PreferenceVectorBasedCorrelationDistance> opticsResult = optics.run(database, relation); - if (LOG.isVerbose()) { + if(LOG.isVerbose()) { LOG.verbose("*** Compute Clusters."); } return computeClusters(relation, opticsResult, dishDistanceQuery); @@ -206,10 +205,10 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin // extract clusters Map<BitSet, List<Pair<BitSet, ArrayModifiableDBIDs>>> clustersMap = extractClusters(database, distFunc, clusterOrder); - if (LOG.isVerbose()) { + if(LOG.isVerbose()) { StringBuilder msg = new StringBuilder("Step 1: extract clusters"); - for (List<Pair<BitSet, ArrayModifiableDBIDs>> clusterList : clustersMap.values()) { - for (Pair<BitSet, ArrayModifiableDBIDs> c : clusterList) { + for(List<Pair<BitSet, ArrayModifiableDBIDs>> clusterList : clustersMap.values()) { + for(Pair<BitSet, ArrayModifiableDBIDs> c : clusterList) { msg.append('\n').append(FormatUtil.format(dimensionality, c.first)).append(" ids ").append(c.second.size()); } } @@ -218,10 +217,10 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin // check if there are clusters < minpts checkClusters(database, distFunc, clustersMap, minpts); - if (LOG.isVerbose()) { + if(LOG.isVerbose()) { StringBuilder msg = new StringBuilder("Step 2: check clusters"); - for (List<Pair<BitSet, ArrayModifiableDBIDs>> clusterList : clustersMap.values()) { - for (Pair<BitSet, ArrayModifiableDBIDs> c : clusterList) { + for(List<Pair<BitSet, ArrayModifiableDBIDs>> clusterList : clustersMap.values()) { + for(Pair<BitSet, ArrayModifiableDBIDs> c : clusterList) { 
msg.append('\n').append(FormatUtil.format(dimensionality, c.first)).append(" ids ").append(c.second.size()); } } @@ -230,9 +229,9 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin // sort the clusters List<Cluster<SubspaceModel<V>>> clusters = sortClusters(database, clustersMap); - if (LOG.isVerbose()) { + if(LOG.isVerbose()) { StringBuilder msg = new StringBuilder("Step 3: sort clusters"); - for (Cluster<SubspaceModel<V>> c : clusters) { + for(Cluster<SubspaceModel<V>> c : clusters) { msg.append('\n').append(FormatUtil.format(dimensionality, c.getModel().getSubspace().getDimensions())).append(" ids ").append(c.size()); } LOG.verbose(msg.toString()); @@ -241,14 +240,14 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin // build the hierarchy Clustering<SubspaceModel<V>> clustering = new Clustering<>("DiSH clustering", "dish-clustering"); buildHierarchy(database, distFunc, clustering, clusters, dimensionality); - if (LOG.isVerbose()) { + if(LOG.isVerbose()) { StringBuilder msg = new StringBuilder("Step 4: build hierarchy"); - for (Cluster<SubspaceModel<V>> c : clusters) { + for(Cluster<SubspaceModel<V>> c : clusters) { msg.append('\n').append(FormatUtil.format(dimensionality, c.getModel().getDimensions())).append(" ids ").append(c.size()); - for (Iter<Cluster<SubspaceModel<V>>> iter = clustering.getClusterHierarchy().iterParents(c); iter.valid(); iter.advance()) { + for(Iter<Cluster<SubspaceModel<V>>> iter = clustering.getClusterHierarchy().iterParents(c); iter.valid(); iter.advance()) { msg.append("\n parent ").append(iter.get()); } - for (Iter<Cluster<SubspaceModel<V>>> iter = clustering.getClusterHierarchy().iterChildren(c); iter.valid(); iter.advance()) { + for(Iter<Cluster<SubspaceModel<V>>> iter = clustering.getClusterHierarchy().iterChildren(c); iter.valid(); iter.advance()) { msg.append("\n child ").append(iter.get()); } } @@ -256,8 +255,8 @@ public class DiSH<V extends NumberVector<?>> 
extends AbstractAlgorithm<Clusterin } // build result - for (Cluster<SubspaceModel<V>> c : clusters) { - if (clustering.getClusterHierarchy().numParents(c) == 0) { + for(Cluster<SubspaceModel<V>> c : clusters) { + if(clustering.getClusterHierarchy().numParents(c) == 0) { clustering.addToplevelCluster(c); } } @@ -278,7 +277,7 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin Map<BitSet, List<Pair<BitSet, ArrayModifiableDBIDs>>> clustersMap = new HashMap<>(); Map<DBID, ClusterOrderEntry<PreferenceVectorBasedCorrelationDistance>> entryMap = new HashMap<>(); Map<DBID, Pair<BitSet, ArrayModifiableDBIDs>> entryToClusterMap = new HashMap<>(); - for (Iterator<ClusterOrderEntry<PreferenceVectorBasedCorrelationDistance>> it = clusterOrder.iterator(); it.hasNext();) { + for(Iterator<ClusterOrderEntry<PreferenceVectorBasedCorrelationDistance>> it = clusterOrder.iterator(); it.hasNext();) { ClusterOrderEntry<PreferenceVectorBasedCorrelationDistance> entry = it.next(); entryMap.put(entry.getID(), entry); @@ -287,43 +286,43 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin // get the list of (parallel) clusters for the preference vector List<Pair<BitSet, ArrayModifiableDBIDs>> parallelClusters = clustersMap.get(preferenceVector); - if (parallelClusters == null) { + if(parallelClusters == null) { parallelClusters = new ArrayList<>(); clustersMap.put(preferenceVector, parallelClusters); } // look for the proper cluster Pair<BitSet, ArrayModifiableDBIDs> cluster = null; - for (Pair<BitSet, ArrayModifiableDBIDs> c : parallelClusters) { + for(Pair<BitSet, ArrayModifiableDBIDs> c : parallelClusters) { V c_centroid = ProjectedCentroid.make(c.first, database, c.second).toVector(database); PreferenceVectorBasedCorrelationDistance dist = distFunc.correlationDistance(object, c_centroid, preferenceVector, preferenceVector); - if (dist.getCorrelationValue() == entry.getReachability().getCorrelationValue()) { + 
if(dist.getCorrelationValue() == entry.getReachability().getCorrelationValue()) { double d = distFunc.weightedDistance(object, c_centroid, dist.getCommonPreferenceVector()); - if (d <= 2 * epsilon) { + if(d <= 2 * epsilon) { cluster = c; break; } } } - if (cluster == null) { + if(cluster == null) { cluster = new Pair<>(preferenceVector, DBIDUtil.newArray()); parallelClusters.add(cluster); } cluster.second.add(entry.getID()); entryToClusterMap.put(entry.getID(), cluster); - if (progress != null) { + if(progress != null) { progress.setProcessed(++processed, LOG); } } - if (progress != null) { + if(progress != null) { progress.ensureCompleted(LOG); } - if (LOG.isDebuggingFiner()) { + if(LOG.isDebuggingFiner()) { StringBuilder msg = new StringBuilder("Step 0"); - for (List<Pair<BitSet, ArrayModifiableDBIDs>> clusterList : clustersMap.values()) { - for (Pair<BitSet, ArrayModifiableDBIDs> c : clusterList) { + for(List<Pair<BitSet, ArrayModifiableDBIDs>> clusterList : clustersMap.values()) { + for(Pair<BitSet, ArrayModifiableDBIDs> c : clusterList) { msg.append('\n').append(FormatUtil.format(RelationUtil.dimensionality(database), c.first)).append(" ids ").append(c.second.size()); } } @@ -331,24 +330,24 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin } // add the predecessor to the cluster - for (BitSet pv : clustersMap.keySet()) { + for(BitSet pv : clustersMap.keySet()) { List<Pair<BitSet, ArrayModifiableDBIDs>> parallelClusters = clustersMap.get(pv); - for (Pair<BitSet, ArrayModifiableDBIDs> cluster : parallelClusters) { - if (cluster.second.isEmpty()) { + for(Pair<BitSet, ArrayModifiableDBIDs> cluster : parallelClusters) { + if(cluster.second.isEmpty()) { continue; } DBID firstID = cluster.second.get(0); ClusterOrderEntry<PreferenceVectorBasedCorrelationDistance> entry = entryMap.get(firstID); DBID predecessorID = entry.getPredecessorID(); - if (predecessorID == null) { + if(predecessorID == null) { continue; } 
ClusterOrderEntry<PreferenceVectorBasedCorrelationDistance> predecessor = entryMap.get(predecessorID); // parallel cluster - if (predecessor.getReachability().getCommonPreferenceVector().equals(entry.getReachability().getCommonPreferenceVector())) { + if(predecessor.getReachability().getCommonPreferenceVector().equals(entry.getReachability().getCommonPreferenceVector())) { continue; } - if (predecessor.getReachability().compareTo(entry.getReachability()) < 0) { + if(predecessor.getReachability().compareTo(entry.getReachability()) < 0) { continue; } @@ -375,16 +374,17 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin final int db_dim = RelationUtil.dimensionality(database); // int num = 1; List<Cluster<SubspaceModel<V>>> clusters = new ArrayList<>(); - for (BitSet pv : clustersMap.keySet()) { + for(BitSet pv : clustersMap.keySet()) { List<Pair<BitSet, ArrayModifiableDBIDs>> parallelClusters = clustersMap.get(pv); - for (int i = 0; i < parallelClusters.size(); i++) { + for(int i = 0; i < parallelClusters.size(); i++) { Pair<BitSet, ArrayModifiableDBIDs> c = parallelClusters.get(i); Cluster<SubspaceModel<V>> cluster = new Cluster<>(c.second); cluster.setModel(new SubspaceModel<>(new Subspace(c.first), Centroid.make(database, c.second).toVector(database))); String subspace = FormatUtil.format(cluster.getModel().getSubspace().getDimensions(), db_dim, ""); - if (parallelClusters.size() > 1) { + if(parallelClusters.size() > 1) { cluster.setName("Cluster_" + subspace + "_" + i); - } else { + } + else { cluster.setName("Cluster_" + subspace); } clusters.add(cluster); @@ -417,11 +417,11 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin List<Pair<BitSet, ArrayModifiableDBIDs>> notAssigned = new ArrayList<>(); Map<BitSet, List<Pair<BitSet, ArrayModifiableDBIDs>>> newClustersMap = new HashMap<>(); Pair<BitSet, ArrayModifiableDBIDs> noise = new Pair<>(new BitSet(), DBIDUtil.newArray()); - for (BitSet pv : 
clustersMap.keySet()) { + for(BitSet pv : clustersMap.keySet()) { // noise - if (pv.cardinality() == 0) { + if(pv.cardinality() == 0) { List<Pair<BitSet, ArrayModifiableDBIDs>> parallelClusters = clustersMap.get(pv); - for (Pair<BitSet, ArrayModifiableDBIDs> c : parallelClusters) { + for(Pair<BitSet, ArrayModifiableDBIDs> c : parallelClusters) { noise.second.addDBIDs(c.second); } } @@ -429,10 +429,11 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin else { List<Pair<BitSet, ArrayModifiableDBIDs>> parallelClusters = clustersMap.get(pv); List<Pair<BitSet, ArrayModifiableDBIDs>> newParallelClusters = new ArrayList<>(parallelClusters.size()); - for (Pair<BitSet, ArrayModifiableDBIDs> c : parallelClusters) { - if (!pv.equals(new BitSet()) && c.second.size() < minpts) { + for(Pair<BitSet, ArrayModifiableDBIDs> c : parallelClusters) { + if(!pv.equals(new BitSet()) && c.second.size() < minpts) { notAssigned.add(c); - } else { + } + else { newParallelClusters.add(c); } } @@ -443,14 +444,15 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin clustersMap.clear(); clustersMap.putAll(newClustersMap); - for (Pair<BitSet, ArrayModifiableDBIDs> c : notAssigned) { - if (c.second.isEmpty()) { + for(Pair<BitSet, ArrayModifiableDBIDs> c : notAssigned) { + if(c.second.isEmpty()) { continue; } Pair<BitSet, ArrayModifiableDBIDs> parent = findParent(database, distFunc, c, clustersMap); - if (parent != null) { + if(parent != null) { parent.second.addDBIDs(c.second); - } else { + } + else { noise.second.addDBIDs(c.second); } } @@ -477,23 +479,23 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin BitSet childPV = child.first; int childCardinality = childPV.cardinality(); - for (BitSet parentPV : clustersMap.keySet()) { + for(BitSet parentPV : clustersMap.keySet()) { int parentCardinality = parentPV.cardinality(); - if (parentCardinality >= childCardinality) { + if(parentCardinality >= 
childCardinality) { continue; } - if (resultCardinality != -1 && parentCardinality <= resultCardinality) { + if(resultCardinality != -1 && parentCardinality <= resultCardinality) { continue; } BitSet pv = (BitSet) childPV.clone(); pv.and(parentPV); - if (pv.equals(parentPV)) { + if(pv.equals(parentPV)) { List<Pair<BitSet, ArrayModifiableDBIDs>> parentList = clustersMap.get(parentPV); - for (Pair<BitSet, ArrayModifiableDBIDs> parent : parentList) { + for(Pair<BitSet, ArrayModifiableDBIDs> parent : parentList) { V parent_centroid = ProjectedCentroid.make(parentPV, database, parent.second).toVector(database); double d = distFunc.weightedDistance(child_centroid, parent_centroid, parentPV); - if (d <= 2 * epsilon) { + if(d <= 2 * epsilon) { result = parent; resultCardinality = parentCardinality; break; @@ -519,57 +521,59 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin final int db_dim = RelationUtil.dimensionality(database); Hierarchy<Cluster<SubspaceModel<V>>> hier = clustering.getClusterHierarchy(); - for (int i = 0; i < clusters.size() - 1; i++) { + for(int i = 0; i < clusters.size() - 1; i++) { Cluster<SubspaceModel<V>> c_i = clusters.get(i); int subspaceDim_i = dimensionality - c_i.getModel().getSubspace().dimensionality(); V ci_centroid = ProjectedCentroid.make(c_i.getModel().getDimensions(), database, c_i.getIDs()).toVector(database); - for (int j = i + 1; j < clusters.size(); j++) { + for(int j = i + 1; j < clusters.size(); j++) { Cluster<SubspaceModel<V>> c_j = clusters.get(j); int subspaceDim_j = dimensionality - c_j.getModel().getSubspace().dimensionality(); - if (subspaceDim_i < subspaceDim_j) { - if (LOG.isDebugging()) { + if(subspaceDim_i < subspaceDim_j) { + if(LOG.isDebugging()) { msg.append("\n l_i=").append(subspaceDim_i).append(" pv_i=[").append(FormatUtil.format(db_dim, c_i.getModel().getSubspace().getDimensions())).append(']'); msg.append("\n l_j=").append(subspaceDim_j).append(" 
pv_j=[").append(FormatUtil.format(db_dim, c_j.getModel().getSubspace().getDimensions())).append(']'); } // noise level reached - if (c_j.getModel().getSubspace().dimensionality() == 0) { + if(c_j.getModel().getSubspace().dimensionality() == 0) { // no parents exists -> parent is noise - if (hier.numParents(c_i) == 0) { + if(hier.numParents(c_i) == 0) { clustering.addChildCluster(c_j, c_i); - if (LOG.isDebugging()) { + if(LOG.isDebugging()) { msg.append("\n [").append(FormatUtil.format(db_dim, c_j.getModel().getSubspace().getDimensions())); msg.append("] is parent of [").append(FormatUtil.format(db_dim, c_i.getModel().getSubspace().getDimensions())); msg.append(']'); } } - } else { + } + else { V cj_centroid = ProjectedCentroid.make(c_j.getModel().getDimensions(), database, c_j.getIDs()).toVector(database); PreferenceVectorBasedCorrelationDistance distance = distFunc.correlationDistance(ci_centroid, cj_centroid, c_i.getModel().getSubspace().getDimensions(), c_j.getModel().getSubspace().getDimensions()); double d = distFunc.weightedDistance(ci_centroid, cj_centroid, distance.getCommonPreferenceVector()); - if (LOG.isDebugging()) { + if(LOG.isDebugging()) { msg.append("\n dist = ").append(distance.getCorrelationValue()); } - if (distance.getCorrelationValue() == subspaceDim_j) { - if (LOG.isDebugging()) { + if(distance.getCorrelationValue() == subspaceDim_j) { + if(LOG.isDebugging()) { msg.append("\n d = ").append(d); } - if (d <= 2 * epsilon) { + if(d <= 2 * epsilon) { // no parent exists or c_j is not a parent of the already // existing parents - if (hier.numParents(c_i) == 0 || !isParent(database, distFunc, c_j, hier.iterParents(c_i))) { + if(hier.numParents(c_i) == 0 || !isParent(database, distFunc, c_j, hier.iterParents(c_i))) { clustering.addChildCluster(c_j, c_i); - if (LOG.isDebugging()) { + if(LOG.isDebugging()) { msg.append("\n [").append(FormatUtil.format(db_dim, c_j.getModel().getSubspace().getDimensions())); msg.append("] is parent of ["); 
msg.append(FormatUtil.format(db_dim, c_i.getModel().getSubspace().getDimensions())); msg.append(']'); } } - } else { + } + else { throw new RuntimeException("Should never happen: d = " + d); } } @@ -577,7 +581,7 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin } } } - if (LOG.isDebugging()) { + if(LOG.isDebugging()) { LOG.debug(msg.toString()); } } @@ -599,11 +603,11 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin int dimensionality = RelationUtil.dimensionality(database); int subspaceDim_parent = dimensionality - parent.getModel().getSubspace().dimensionality(); - for (; iter.valid(); iter.advance()) { + for(; iter.valid(); iter.advance()) { Cluster<SubspaceModel<V>> child = iter.get(); V child_centroid = ProjectedCentroid.make(child.getModel().getDimensions(), database, child.getIDs()).toVector(database); PreferenceVectorBasedCorrelationDistance distance = distFunc.correlationDistance(parent_centroid, child_centroid, parent.getModel().getSubspace().getDimensions(), child.getModel().getSubspace().getDimensions()); - if (distance.getCorrelationValue() == subspaceDim_parent) { + if(distance.getCorrelationValue() == subspaceDim_parent) { return true; } } @@ -642,14 +646,14 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin super.makeOptions(config); DoubleParameter epsilonP = new DoubleParameter(EPSILON_ID, 0.001); - epsilonP.addConstraint(new GreaterEqualConstraint(0)); - if (config.grab(epsilonP)) { + epsilonP.addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_DOUBLE); + if(config.grab(epsilonP)) { epsilon = epsilonP.doubleValue(); } IntParameter muP = new IntParameter(MU_ID, 1); - muP.addConstraint(new GreaterConstraint(0)); - if (config.grab(muP)) { + muP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT); + if(config.grab(muP)) { mu = muP.intValue(); } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/HiSC.java 
b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/HiSC.java index 9ac7c072..3f135564 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/HiSC.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/HiSC.java @@ -34,8 +34,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Description; import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.LessConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ChainedParameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ListParameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
@@ -95,8 +94,8 @@ public class HiSC<V extends NumberVector<?>> extends OPTICS<V, PreferenceVectorB protected void makeOptions(Parameterization config) {
super.makeOptions(config);
DoubleParameter alphaP = new DoubleParameter(HiSCPreferenceVectorIndex.Factory.ALPHA_ID, HiSCPreferenceVectorIndex.Factory.DEFAULT_ALPHA);
- alphaP.addConstraint(new GreaterConstraint(0.0)); - alphaP.addConstraint(new LessConstraint(1.0));
+ alphaP.addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE); + alphaP.addConstraint(CommonConstraints.LESS_THAN_ONE_DOUBLE);
double alpha = 0.0;
if(config.grab(alphaP)) {
alpha = alphaP.doubleValue();
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/P3C.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/P3C.java new file mode 100644 index 00000000..9d1ee94d --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/P3C.java @@ -0,0 +1,1000 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.subspace;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2013
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.BitSet;
+import java.util.Iterator;
+import java.util.List;
+
+import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm;
+import de.lmu.ifi.dbs.elki.algorithm.clustering.EM;
+import de.lmu.ifi.dbs.elki.data.Cluster;
+import de.lmu.ifi.dbs.elki.data.Clustering;
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.data.Subspace;
+import de.lmu.ifi.dbs.elki.data.VectorUtil;
+import de.lmu.ifi.dbs.elki.data.VectorUtil.SortDBIDsBySingleDimension;
+import de.lmu.ifi.dbs.elki.data.model.SubspaceModel;
+import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
+import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
+import de.lmu.ifi.dbs.elki.database.Database;
+import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory;
+import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil;
+import de.lmu.ifi.dbs.elki.database.datastore.WritableDataStore;
+import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDArrayIter;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDMIter;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.HashSetModifiableDBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.SetDBIDs;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.database.relation.RelationUtil;
+import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.logging.progress.MutableProgress;
+import de.lmu.ifi.dbs.elki.logging.progress.StepProgress;
+import de.lmu.ifi.dbs.elki.math.MathUtil;
+import de.lmu.ifi.dbs.elki.math.MeanVariance;
+import de.lmu.ifi.dbs.elki.math.linearalgebra.CovarianceMatrix;
+import de.lmu.ifi.dbs.elki.math.linearalgebra.Matrix;
+import de.lmu.ifi.dbs.elki.math.linearalgebra.VMath;
+import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector;
+import de.lmu.ifi.dbs.elki.math.statistics.distribution.ChiSquaredDistribution;
+import de.lmu.ifi.dbs.elki.math.statistics.distribution.PoissonDistribution;
+import de.lmu.ifi.dbs.elki.utilities.BitsUtil;
+import de.lmu.ifi.dbs.elki.utilities.FormatUtil;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
+
+/**
+ * P3C: A Robust Projected Clustering Algorithm.
+ *
+ * <p>
+ * Reference: <br/>
+ * Gabriela Moise, Jörg Sander, Martin Ester<br />
+ * P3C: A Robust Projected Clustering Algorithm.<br/>
+ * In: Proc. Sixth International Conference on Data Mining (ICDM '06)
+ * </p>
+ *
+ * This is not a complete implementation of P3C, but good enough for most users.
+ * Improvements are welcome. The most obviously missing step is section 3.5 of
+ * P3C, where the cluster subspaces are refined.
+ *
+ * @author Florian Nuecke
+ * @author Erich Schubert
+ *
+ * @apiviz.uses EM
+ * @apiviz.has SubspaceModel
+ * @apiviz.has ClusterCandidate
+ * @apiviz.has Signature
+ *
+ * @param <V> the type of NumberVector handled by this Algorithm.
+ */
+@Title("P3C: A Robust Projected Clustering Algorithm.")
+@Reference(authors = "Gabriela Moise, Jörg Sander, Martin Ester", title = "P3C: A Robust Projected Clustering Algorithm", booktitle = "Proc. Sixth International Conference on Data Mining (ICDM '06)", url = "http://dx.doi.org/10.1109/ICDM.2006.123")
+public class P3C<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering<SubspaceModel<V>>> implements SubspaceClusteringAlgorithm<SubspaceModel<V>> {
+ /**
+ * The logger for this class.
+ */
+ private static final Logging LOG = Logging.getLogger(P3C.class);
+
+ /**
+ * Parameter for the Poisson test threshold.
+ */
+ protected double poissonThreshold;
+
+ /**
+ * Maximum number of iterations for the EM step.
+ */
+ protected int maxEmIterations;
+
+ /**
+ * Threshold when to stop EM iterations.
+ */
+ protected double emDelta;
+
+ /**
+ * Minimum cluster size for noise flagging. (This parameter is not part of
+ * the original publication.)
+ */
+ protected int minClusterSize;
+
+ /**
+ * Alpha threshold for testing.
+ */
+ protected double alpha = 0.001;
+
+ /**
+ * Constructor.
+ *
+ * @param alpha ChiSquared test threshold
+ * @param poissonThreshold Poisson test threshold
+ * @param maxEmIterations Maximum number of EM iterations
+ * @param emDelta EM stopping threshold
+ * @param minClusterSize Minimum cluster size
+ */
+ public P3C(double alpha, double poissonThreshold, int maxEmIterations, double emDelta, int minClusterSize) {
+ super();
+ this.alpha = alpha;
+ this.poissonThreshold = poissonThreshold;
+ this.maxEmIterations = maxEmIterations;
+ this.emDelta = emDelta;
+ this.minClusterSize = minClusterSize;
+ }
+
+ /**
+ * Performs the P3C algorithm on the given Database.
+ */
+ public Clustering<SubspaceModel<V>> run(Database database, Relation<V> relation) {
+ final int dim = RelationUtil.dimensionality(relation);
+
+ // Overall progress.
+ StepProgress stepProgress = LOG.isVerbose() ? new StepProgress(8) : null;
+
+ if(stepProgress != null) {
+ stepProgress.beginStep(1, "Grid-partitioning data.", LOG);
+ }
+
+ // Desired number of bins, as per Sturge:
+ final int binCount = (int) Math.ceil(1 + (Math.log(relation.size()) / MathUtil.LOG2));
+
+ // Perform 1-dimensional projections, and split into bins.
+ SetDBIDs[][] partitions = partitionData(relation, binCount);
+
+ if(stepProgress != null) {
+ stepProgress.beginStep(2, "Searching for non-uniform bins in support histograms.", LOG);
+ }
+
+ // Set markers for each attribute until they're all deemed uniform.
+ final long[][] markers = new long[dim][];
+ int numuniform = 0;
+ for(int d = 0; d < dim; d++) {
+ final SetDBIDs[] parts = partitions[d];
+ if(parts == null) {
+ continue; // Never mark any on constant dimensions.
+ }
+ final long[] marked = markers[d] = BitsUtil.zero(binCount);
+ int card = 0;
+ while(card < dim - 1) {
+ // Find bin with largest support, test only the dimensions that were not
+ // previously marked.
+ int bestBin = chiSquaredUniformTest(parts, marked, card);
+ if(bestBin < 0) {
+ numuniform++;
+ break; // Uniform
+ }
+ BitsUtil.setI(marked, bestBin);
+ card++;
+ }
+ if(LOG.isDebugging()) {
+ LOG.debug("Marked bins in dim " + d + ": " + BitsUtil.toString(marked, binCount));
+ }
+ }
+
+ if(stepProgress != null) {
+ stepProgress.beginStep(3, "Merging marked bins to 1-signatures.", LOG);
+ }
+
+ ArrayList<Signature> signatures = constructOneSignatures(partitions, markers);
+
+ if(stepProgress != null) {
+ stepProgress.beginStep(4, "Computing cluster cores from merged p-signatures.", LOG);
+ }
+
+ ArrayList<Signature> clusterCores = mergeClusterCores(binCount, signatures);
+
+ if(stepProgress != null) {
+ stepProgress.beginStep(5, "Pruning redundant cluster cores.", LOG);
+ }
+
+ clusterCores = pruneRedundantClusterCores(clusterCores);
+ if(LOG.isVerbose()) {
+ LOG.verbose("Number of cluster cores found: " + clusterCores.size());
+ }
+
+ if(clusterCores.size() == 0) {
+ stepProgress.setCompleted(LOG);
+ Clustering<SubspaceModel<V>> c = new Clustering<>("P3C", "P3C");
+ c.addToplevelCluster(new Cluster<SubspaceModel<V>>(relation.getDBIDs(), true));
+ return c;
+ }
+
+ if(stepProgress != null) {
+ stepProgress.beginStep(5, "Refining cluster cores to clusters via EM.", LOG);
+ }
+
+ // Track objects not assigned to any cluster:
+ ModifiableDBIDs noise = DBIDUtil.newHashSet();
+ WritableDataStore<double[]> probClusterIGivenX = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_SORTED, double[].class);
+ int k = clusterCores.size();
+ double[] clusterWeights = new double[k];
+ computeFuzzyMembership(relation, clusterCores, noise, probClusterIGivenX, clusterWeights);
+
+ // Initial estimate of covariances, to assign noise objects
+ Vector[] means = new Vector[k];
+ Matrix[] covarianceMatrices = new Matrix[k], invCovMatr = new Matrix[k];
+ final double norm = MathUtil.powi(MathUtil.TWOPI, dim);
+ double[] normDistrFactor = new double[k];
+ Arrays.fill(normDistrFactor, 1. / Math.sqrt(norm));
+ EM.recomputeCovarianceMatrices(relation, probClusterIGivenX, means, covarianceMatrices, dim);
+ EM.computeInverseMatrixes(covarianceMatrices, invCovMatr, normDistrFactor, norm);
+ assignUnassigned(relation, probClusterIGivenX, means, invCovMatr, clusterWeights, noise);
+
+ double emNew = EM.assignProbabilitiesToInstances(relation, normDistrFactor, means, invCovMatr, clusterWeights, probClusterIGivenX);
+ for(int it = 1; it <= maxEmIterations || maxEmIterations < 0; it++) {
+ final double emOld = emNew;
+ EM.recomputeCovarianceMatrices(relation, probClusterIGivenX, means, covarianceMatrices, dim);
+ EM.computeInverseMatrixes(covarianceMatrices, invCovMatr, normDistrFactor, norm);
+ // reassign probabilities
+ emNew = EM.assignProbabilitiesToInstances(relation, normDistrFactor, means, invCovMatr, clusterWeights, probClusterIGivenX);
+
+ if(LOG.isVerbose()) {
+ LOG.verbose("iteration " + it + " - expectation value: " + emNew);
+ }
+ if((emNew - emOld) <= emDelta) {
+ break;
+ }
+ }
+
+ // Perform EM clustering.
+
+ if(stepProgress != null) {
+ stepProgress.beginStep(6, "Generating hard clustering.", LOG);
+ }
+
+ // Create a hard clustering, making sure each data point only is part of one
+ // cluster, based on the best match from the membership matrix.
+ ArrayList<ClusterCandidate> clusterCandidates = hardClustering(probClusterIGivenX, clusterCores, relation.getDBIDs());
+
+ if(stepProgress != null) {
+ stepProgress.beginStep(7, "Looking for outliers and moving them to the noise set.", LOG);
+ }
+
+ // Outlier detection. Remove points from clusters that have a Mahalanobis
+ // distance larger than the critical value of the ChiSquare distribution.
+ findOutliers(relation, means, invCovMatr, clusterCandidates, dim - numuniform, noise);
+
+ if(stepProgress != null) {
+ stepProgress.beginStep(8, "Removing empty clusters.", LOG);
+ }
+
+ // Remove near-empty clusters.
+ for(Iterator<ClusterCandidate> it = clusterCandidates.iterator(); it.hasNext();) {
+ ClusterCandidate cand = it.next();
+ final int size = cand.ids.size();
+ if(size < minClusterSize) {
+ if(size > 0) {
+ noise.addDBIDs(cand.ids);
+ }
+ it.remove();
+ }
+ }
+
+ if(LOG.isVerbose()) {
+ LOG.verbose("Number of clusters remaining: " + clusterCandidates.size());
+ }
+
+ // TODO Check all attributes previously deemed uniform (section 3.5).
+
+ if(stepProgress != null) {
+ stepProgress.beginStep(9, "Generating final result.", LOG);
+ }
+
+ // Generate final output.
+ Clustering<SubspaceModel<V>> result = new Clustering<>("P3C", "P3C");
+ for(int cluster = 0; cluster < clusterCandidates.size(); ++cluster) {
+ ClusterCandidate candidate = clusterCandidates.get(cluster);
+ CovarianceMatrix cvm = CovarianceMatrix.make(relation, candidate.ids);
+ result.addToplevelCluster(new Cluster<>(candidate.ids, new SubspaceModel<>(new Subspace(candidate.dimensions), cvm.getMeanVector(relation))));
+ }
+ LOG.verbose("Noise size: " + noise.size());
+ if(noise.size() > 0) {
+ result.addToplevelCluster(new Cluster<SubspaceModel<V>>(noise, true));
+ }
+
+ if(stepProgress != null) {
+ stepProgress.ensureCompleted(LOG);
+ }
+
+ return result;
+ }
+
+ /**
+ * Construct the 1-signatures by merging adjacent dense bins.
+ *
+ * @param partitions Initial partitions.
+ * @param markers Markers for dense partitions.
+ * @return 1-signatures
+ */
+ private ArrayList<Signature> constructOneSignatures(SetDBIDs[][] partitions, final long[][] markers) {
+ final int dim = partitions.length;
+ // Generate projected p-signature intervals.
+ ArrayList<Signature> signatures = new ArrayList<>();
+ for(int d = 0; d < dim; d++) {
+ final DBIDs[] parts = partitions[d];
+ if(parts == null) {
+ continue; // Never mark any on constant dimensions.
+ }
+ final long[] marked = markers[d];
+ // Find sequences of 1s in marked.
+ for(int start = BitsUtil.nextSetBit(marked, 0); start >= 0;) {
+ int end = BitsUtil.nextClearBit(marked, start + 1);
+ end = (end == -1) ? dim : end;
+ int[] signature = new int[dim << 1];
+ Arrays.fill(signature, -1);
+ signature[d << 1] = start;
+ signature[(d << 1) + 1] = end - 1; // inclusive
+ HashSetModifiableDBIDs sids = unionDBIDs(parts, start, end /* exclusive */);
+ if(LOG.isDebugging()) {
+ LOG.debug("1-signature: " + d + " " + start + "-" + (end - 1));
+ }
+ signatures.add(new Signature(signature, sids));
+ start = (end < dim) ? BitsUtil.nextSetBit(marked, end + 1) : -1;
+ }
+ }
+ return signatures;
+ }
+
+ /**
+ * Merge 1-signatures into p-signatures.
+ *
+ * @param binCount Number of bins in each dimension.
+ * @param signatures 1-signatures
+ * @return p-signatures
+ */
+ private ArrayList<Signature> mergeClusterCores(final int binCount, ArrayList<Signature> signatures) {
+ MutableProgress mergeProgress = LOG.isVerbose() ? new MutableProgress("Merging signatures.", signatures.size(), LOG) : null;
+
+ // Annotate dimensions to 1-signatures for quick stopping.
+ int[] firstdim = new int[signatures.size()];
+ for(int i = 0; i < signatures.size(); i++) {
+ firstdim[i] = signatures.get(i).getFirstDim();
+ }
+ LOG.debug("First dimensions: " + FormatUtil.format(firstdim));
+
+ // Merge to (p+1)-signatures (cluster cores).
+ ArrayList<Signature> clusterCores = new ArrayList<>(signatures);
+ // Try adding merge 1-signature with each cluster core.
+ for(int i = 0; i < clusterCores.size(); i++) {
+ final Signature parent = clusterCores.get(i);
+ final int end = parent.getFirstDim();
+ for(int j = 0; j < signatures.size() && firstdim[j] < end; j++) {
+ final Signature onesig = signatures.get(j);
+ final Signature merge = mergeSignatures(parent, onesig, binCount);
+ if(merge != null) {
+ // We add each potential core to the list to allow remaining
+ // 1-signatures to try merging with this p-signature as well.
+ clusterCores.add(merge);
+ // Flag both "parents" for removal.
+ parent.prune = true;
+ onesig.prune = true;
+ }
+ }
+ if(mergeProgress != null) {
+ mergeProgress.setTotal(clusterCores.size());
+ mergeProgress.incrementProcessed(LOG);
+ }
+ }
+ if(mergeProgress != null) {
+ mergeProgress.setProcessed(mergeProgress.getTotal(), LOG);
+ }
+ return clusterCores;
+ }
+
+ private ArrayList<Signature> pruneRedundantClusterCores(ArrayList<Signature> clusterCores) {
+ // Prune cluster cores based on Definition 3, Condition 2.
+ ArrayList<Signature> retain = new ArrayList<>(clusterCores.size());
+ outer: for(Signature clusterCore : clusterCores) {
+ if(clusterCore.prune) {
+ continue;
+ }
+ for(int k = 0; k < clusterCores.size(); k++) {
+ Signature other = clusterCores.get(k);
+ if(other != clusterCore) {
+ if(other.isSuperset(clusterCore)) {
+ continue outer;
+ }
+ }
+ }
+ if(LOG.isDebugging()) {
+ LOG.debug("Retained cluster core: " + clusterCore);
+ }
+ retain.add(clusterCore);
+ }
+ clusterCores = retain;
+ return clusterCores;
+ }
+
+ /**
+ * Partition the data set into {@code bins} bins in each dimension
+ * <i>independently</i>.
+ *
+ * This can be used to construct a grid approximation of the data using O(d n)
+ * memory.
+ *
+ * When a dimension is found to be constant, it will not be partitioned, but
+ * instead the corresponding array will be set to {@code null}.
+ *
+ * @param relation Data relation to partition
+ * @param bins Number of bins
+ * @return Partitions of each dimension.
+ */
+ private SetDBIDs[][] partitionData(final Relation<V> relation, final int bins) {
+ final int dim = RelationUtil.dimensionality(relation);
+ SetDBIDs[][] partitions = new SetDBIDs[dim][bins];
+ ArrayModifiableDBIDs ids = DBIDUtil.newArray(relation.getDBIDs());
+ DBIDArrayIter iter = ids.iter(); // will be reused.
+ SortDBIDsBySingleDimension sorter = new VectorUtil.SortDBIDsBySingleDimension(relation, 0);
+ for(int d = 0; d < dim; d++) {
+ sorter.setDimension(d);
+ ids.sort(sorter);
+ // Minimum:
+ iter.seek(0);
+ double min = relation.get(iter).doubleValue(d);
+ // Extend:
+ iter.seek(ids.size() - 1);
+ double delta = (relation.get(iter).doubleValue(d) - min) / bins;
+ if(delta > 0.) {
+ SetDBIDs[] dimparts = partitions[d];
+ double split = min + delta;
+ HashSetModifiableDBIDs pids = DBIDUtil.newHashSet();
+ dimparts[0] = pids;
+ int i = 0;
+ for(iter.seek(0); iter.valid(); iter.advance()) {
+ final double v = relation.get(iter).doubleValue(d);
+ if(v <= split || i == dimparts.length - 1) {
+ pids.add(iter);
+ }
+ else {
+ i++;
+ split += delta;
+ pids = DBIDUtil.newHashSet();
+ dimparts[i] = pids;
+ }
+ }
+ for(++i; i < dimparts.length; ++i) {
+ dimparts[i] = pids;
+ }
+ }
+ else {
+ partitions[d] = null; // Flag whole dimension as bad
+ }
+ }
+ return partitions;
+ }
+
+ /**
+ * Compute the union of multiple DBID sets.
+ *
+ * @param parts Parts array
+ * @param start Array start index
+ * @param end Array end index (exclusive)
+ * @return
+ */
+ protected HashSetModifiableDBIDs unionDBIDs(final DBIDs[] parts, int start, int end) {
+ int sum = 0;
+ for(int i = start; i < end; i++) {
+ sum += parts[i].size();
+ }
+ HashSetModifiableDBIDs sids = DBIDUtil.newHashSet(sum);
+ for(int i = start; i < end; i++) {
+ sids.addDBIDs(parts[i]);
+ }
+ return sids;
+ }
+
+  /**
+   * Performs a ChiSquared test to determine whether an attribute has a uniform
+   * distribution.
+   *
+   * @param parts Data partitions.
+   * @param marked the marked bins that should be ignored.
+   * @param card Cardinality (number of bins already marked).
+   * @return Position of maximum, or -1 when uniform.
+   */
+  private int chiSquaredUniformTest(SetDBIDs[] parts, long[] marked, int card) {
+    // Remaining number of bins (marked bins are excluded below).
+    final int binCount = parts.length - card;
+    // Get global mean over all unmarked bins, and track the largest bin.
+    int max = 0, maxpos = -1;
+    MeanVariance mv = new MeanVariance();
+    for(int i = 0; i < parts.length; i++) {
+      // Ignore already marked bins.
+      if(BitsUtil.get(marked, i)) {
+        continue;
+      }
+      final int binSupport = parts[i].size();
+      mv.put(binSupport);
+      if(binSupport > max) {
+        max = binSupport;
+        maxpos = i;
+      }
+    }
+    // Degenerate cases: nothing left to test, or all bins equal.
+    if(mv.getCount() < 1. || !(mv.getNaiveVariance() > 0.)) {
+      return -1;
+    }
+    // ChiSquare statistic is the naive variance of the sizes!
+    final double chiSquare = mv.getNaiveVariance() / mv.getMean();
+    // FIX: degrees of freedom for a uniformity test over binCount bins is
+    // binCount - 1; the old code subtracted card a second time, although
+    // binCount already excludes the marked bins.
+    final double test = ChiSquaredDistribution.cdf(chiSquare, Math.max(1, binCount - 1));
+    if((1. - alpha) < test) {
+      return maxpos;
+    }
+    return -1;
+  }
+
+  /**
+   * Computes a fuzzy membership with the weights based on which cluster cores
+   * each data point is part of.
+   *
+   * @param relation Data relation
+   * @param clusterCores the cluster cores.
+   * @param unassigned set to which to add unassigned points.
+   * @param probClusterIGivenX Membership probabilities.
+   * @param clusterWeights Cluster weights
+   */
+  private void computeFuzzyMembership(Relation<V> relation, ArrayList<Signature> clusterCores, ModifiableDBIDs unassigned, WritableDataStore<double[]> probClusterIGivenX, double[] clusterWeights) {
+    final int numPoints = relation.size();
+    final int numClusters = clusterCores.size();
+
+    for(DBIDIter it = relation.iterDBIDs(); it.valid(); it.advance()) {
+      final double[] membership = new double[numClusters];
+      int hits = 0;
+      for(int c = 0; c < numClusters; ++c) {
+        if(clusterCores.get(c).ids.contains(it)) {
+          membership[c] = 1.;
+          ++hits;
+        }
+      }
+      if(hits > 0) {
+        // Distribute membership equally over all matching cores, and
+        // accumulate the (normalized) cluster weights.
+        VMath.timesEquals(membership, 1. / hits);
+        VMath.plusTimesEquals(clusterWeights, membership, 1. / numPoints);
+      }
+      else {
+        // Not contained in any core - remember for later assignment.
+        unassigned.add(it);
+      }
+      probClusterIGivenX.put(it, membership);
+    }
+  }
+
+  /**
+   * Assign unassigned objects to the best candidate based on shortest
+   * Mahalanobis distance to the cluster means.
+   *
+   * @param relation Data relation
+   * @param probClusterIGivenX fuzzy membership matrix.
+   * @param means Cluster means.
+   * @param invCovMatr Inverse cluster covariance matrices.
+   * @param clusterWeights Cluster weights, updated for the new assignments.
+   * @param unassigned the list of points not yet assigned; emptied on return.
+   */
+  private void assignUnassigned(Relation<V> relation, WritableDataStore<double[]> probClusterIGivenX, Vector[] means, Matrix[] invCovMatr, double[] clusterWeights, ModifiableDBIDs unassigned) {
+    if(unassigned.size() == 0) {
+      return; // Nothing to do.
+    }
+    final int numClusters = means.length;
+    final double pointWeight = 1. / relation.size();
+
+    for(DBIDIter it = unassigned.iter(); it.valid(); it.advance()) {
+      // The cluster with the smallest Mahalanobis distance wins.
+      final Vector vec = relation.get(it).getColumnVector();
+      double best = Double.POSITIVE_INFINITY;
+      int bestIdx = -1;
+      for(int c = 0; c < numClusters; ++c) {
+        final double dist = MathUtil.mahalanobisDistance(invCovMatr[c], vec.minus(means[c]));
+        if(dist < best) {
+          best = dist;
+          bestIdx = c;
+        }
+      }
+      // Hard-assign to the best matching core.
+      final double[] weights = new double[numClusters];
+      weights[bestIdx] = 1.0;
+      clusterWeights[bestIdx] += pointWeight;
+      probClusterIGivenX.put(it, weights);
+    }
+
+    // All objects have been assigned now.
+    unassigned.clear();
+  }
+
+  /**
+   * Creates a hard clustering from the specified soft membership matrix.
+   *
+   * @param probClusterIGivenX the membership matrix.
+   * @param clusterCores the cluster cores, yielding one candidate each.
+   * @param dbids the objects to assign.
+   * @return a hard clustering based on the matrix.
+   */
+  private ArrayList<ClusterCandidate> hardClustering(WritableDataStore<double[]> probClusterIGivenX, List<Signature> clusterCores, DBIDs dbids) {
+    final int k = clusterCores.size();
+
+    // One (initially empty) candidate per cluster core.
+    ArrayList<ClusterCandidate> candidates = new ArrayList<>(k);
+    for(Signature sig : clusterCores) {
+      candidates.add(new ClusterCandidate(sig));
+    }
+
+    // Assign each data point to exactly one cluster: the one it is most
+    // likely to belong to (argmax over the membership row).
+    for(DBIDIter it = dbids.iter(); it.valid(); it.advance()) {
+      final double[] probs = probClusterIGivenX.get(it);
+      int argmax = 0;
+      for(int c = 1; c < k; ++c) {
+        if(probs[c] > probs[argmax]) {
+          argmax = c;
+        }
+      }
+      candidates.get(argmax).ids.add(it);
+    }
+
+    return candidates;
+  }
+
+  /**
+   * Performs outlier detection by testing the Mahalanobis distance of each
+   * point in a cluster against the critical value of the ChiSquared
+   * distribution with as many degrees of freedom as the cluster has relevant
+   * attributes.
+   *
+   * @param relation Data relation
+   * @param means Cluster means
+   * @param invCovMatr Inverse covariance matrixes
+   * @param clusterCandidates the list of clusters to check.
+   * @param nonUniformDimensionCount the number of dimensions to consider when
+   *        testing.
+   * @param noise the set to which to add points deemed outliers.
+   */
+  private void findOutliers(Relation<V> relation, Vector[] means, Matrix[] invCovMatr, ArrayList<ClusterCandidate> clusterCandidates, int nonUniformDimensionCount, ModifiableDBIDs noise) {
+    final int numClusters = clusterCandidates.size();
+    for(int c = 0; c < numClusters; ++c) {
+      final ClusterCandidate cand = clusterCandidates.get(c);
+      // Empty and singleton candidates cannot be tested.
+      if(cand.ids.size() < 2) {
+        continue;
+      }
+      final int dof = cand.dimensions.cardinality();
+      // NOTE(review): the significance level .001 is hard-coded here and
+      // independent of the alpha parameter - confirm this is intentional.
+      final double critical = ChiSquaredDistribution.quantile(1 - .001, dof);
+      final Vector mean = means[c];
+      final Matrix invCov = invCovMatr[c];
+      for(DBIDMIter it = cand.ids.iter(); it.valid(); it.advance()) {
+        final Vector delta = relation.get(it).getColumnVector().minusEquals(mean);
+        if(MathUtil.mahalanobisDistance(invCov, delta) >= critical) {
+          // Move the point from the cluster to the outlier set.
+          noise.add(it);
+          it.remove();
+        }
+      }
+    }
+  }
+
+  /**
+   * Generates a merged signature of this and another one, where the other
+   * signature must be a 1-signature.
+   *
+   * @param first First signature.
+   * @param second Second signature, must be a 1-signature.
+   * @param numBins Number of bins per dimension.
+   * @return the merged signature, or null if the merge failed.
+   */
+  protected Signature mergeSignatures(Signature first, Signature second, int numBins) {
+    // Locate the single dimension selected by the 1-signature.
+    int d2 = -1;
+    for(int i = 0; i < second.spec.length; i += 2) {
+      if(second.spec[i] >= 0) {
+        assert (d2 == -1) : "Merging with non-1-signature?!?";
+        d2 = i;
+      }
+    }
+    assert (d2 >= 0) : "Merging with empty signature?";
+
+    // Avoid generating redundant signatures.
+    if(first.spec[d2] >= 0) {
+      return null;
+    }
+
+    // Definition 3, Condition 1:
+    // True support:
+    final ModifiableDBIDs intersection = DBIDUtil.intersection(first.ids, second.ids);
+    final int support = intersection.size();
+    // Interval width, computed using selected number of bins / total bins
+    double width = (second.spec[d2 + 1] - second.spec[d2] + 1.) / (double) numBins;
+    // Expected size thus:
+    double expect = first.ids.size() * width;
+    if(support <= expect || support < minClusterSize) {
+      return null;
+    }
+    final double test = PoissonDistribution.rawProbability(support, expect);
+    if((poissonThreshold) <= test) {
+      return null;
+    }
+    // Create merged signature, taking the full interval of the 1-signature.
+    int[] spec = first.spec.clone();
+    spec[d2] = second.spec[d2];
+    // FIX: copy the interval END into the end slot (was second.spec[d2], the
+    // interval START, which collapsed multi-bin intervals to their first bin;
+    // the width computation above shows the two bounds can differ).
+    spec[d2 + 1] = second.spec[d2 + 1];
+
+    final Signature newsig = new Signature(spec, intersection);
+    if(LOG.isDebugging()) {
+      LOG.debug(newsig.toString());
+    }
+    return newsig;
+  }
+
+  /**
+   * P3C Cluster signature: a bin interval per selected dimension, plus the
+   * IDs of the objects supporting it.
+   *
+   * @author Erich Schubert
+   */
+  private static class Signature {
+    /**
+     * Subspace specification: two entries per dimension, the first and last
+     * bin of the interval; -1 when the dimension is not selected.
+     */
+    int[] spec;
+
+    /**
+     * Object ids supporting the signature.
+     */
+    DBIDs ids;
+
+    /**
+     * Pruning flag.
+     */
+    boolean prune = false;
+
+    /**
+     * Constructor.
+     *
+     * @param spec Subspace specification
+     * @param ids IDs.
+     */
+    private Signature(int[] spec, DBIDs ids) {
+      super();
+      this.spec = spec;
+      this.ids = ids;
+    }
+
+    /**
+     * Test whether this is a superset of the other signature, i.e. every
+     * dimension selected by the other signature is present here with the
+     * exact same interval.
+     *
+     * @param other Other signature.
+     * @return {@code true} when this is a superset.
+     */
+    public boolean isSuperset(Signature other) {
+      for(int i = 0; i < spec.length; i += 2) {
+        // FIX: compare interval ends to the OTHER's end (was other.spec[i],
+        // the other's interval start, which broke multi-bin intervals).
+        if(spec[i] != other.spec[i] || spec[i + 1] != other.spec[i + 1]) {
+          // A mismatch only matters when the other signature uses this
+          // dimension at all.
+          if(other.spec[i] != -1) {
+            return false;
+          }
+        }
+      }
+      return true;
+    }
+
+    /**
+     * Find the first dimension set in this signature.
+     *
+     * @return Dimension, or -1 when no dimension is set.
+     */
+    public int getFirstDim() {
+      for(int i = 0; i < spec.length; i += 2) {
+        if(spec[i] >= 0) {
+          return (i >>> 1);
+        }
+      }
+      return -1;
+    }
+
+    @Override
+    public String toString() {
+      // Count selected dimensions for the "p-signature" prefix.
+      int p = 0;
+      for(int i = 0; i < spec.length; i += 2) {
+        if(spec[i] >= 0) {
+          p++;
+        }
+      }
+      StringBuilder buf = new StringBuilder();
+      buf.append(p).append("-signature: ");
+      for(int i = 0; i < spec.length; i += 2) {
+        if(spec[i] >= 0) {
+          buf.append(i >>> 1).append(':');
+          buf.append(spec[i]).append('-').append(spec[i + 1]).append(' ');
+        }
+      }
+      buf.append(" size: ").append(ids.size());
+      return buf.toString();
+    }
+  }
+
+  /**
+   * This class is used to represent potential clusters.
+   *
+   * @author Erich Schubert
+   */
+  private static class ClusterCandidate {
+    /**
+     * Selected dimensions
+     */
+    public final BitSet dimensions;
+
+    /**
+     * Objects contained in cluster.
+     */
+    public final ModifiableDBIDs ids;
+
+    /**
+     * Constructor.
+     *
+     * @param clusterCore Signature
+     */
+    public ClusterCandidate(Signature clusterCore) {
+      this.dimensions = new BitSet(clusterCore.spec.length >> 1);
+      for(int i = 0; i < clusterCore.spec.length; i += 2) {
+        // FIX: only mark dimensions actually selected by the signature
+        // (spec[i] >= 0); previously every dimension was set, which made the
+        // bit set meaningless and inflated the degrees of freedom used in the
+        // chi-squared outlier test.
+        if(clusterCore.spec[i] >= 0) {
+          this.dimensions.set(i >> 1);
+        }
+      }
+      this.ids = DBIDUtil.newArray(clusterCore.ids.size());
+    }
+  }
+
+  /**
+   * {@inheritDoc}
+   *
+   * P3C operates on fixed-dimensionality numerical vector fields.
+   */
+  @Override
+  public TypeInformation[] getInputTypeRestriction() {
+    return TypeUtil.array(TypeUtil.NUMBER_VECTOR_FIELD);
+  }
+
+  /**
+   * {@inheritDoc}
+   *
+   * Returns the class logger for progress and debug output.
+   */
+  @Override
+  protected Logging getLogger() {
+    return LOG;
+  }
+
+  /**
+   * Parameterization class.
+   *
+   * @author Florian Nuecke
+   *
+   * @apiviz.exclude
+   */
+  public static class Parameterizer<V extends NumberVector<?>> extends AbstractParameterizer {
+    /**
+     * Parameter for the chi squared test threshold.
+     */
+    public static final OptionID ALPHA_THRESHOLD_ID = new OptionID("p3c.alpha", "The significance level for uniform testing in the initial binning step.");
+
+    /**
+     * Parameter for the poisson test threshold.
+     */
+    public static final OptionID POISSON_THRESHOLD_ID = new OptionID("p3c.threshold", "The threshold value for the poisson test used when merging signatures.");
+
+    /**
+     * Maximum number of iterations for the EM step.
+     */
+    public static final OptionID MAX_EM_ITERATIONS_ID = new OptionID("p3c.em.maxiter", "The maximum number of iterations for the EM step. Use -1 to run until delta convergence.");
+
+    /**
+     * Threshold when to stop EM iterations.
+     */
+    public static final OptionID EM_DELTA_ID = new OptionID("p3c.em.delta", "The change delta for the EM step below which to stop.");
+
+    /**
+     * Minimum cluster size for noise flagging. (Not existant in the original
+     * publication).
+     */
+    public static final OptionID MIN_CLUSTER_SIZE_ID = new OptionID("p3c.minsize", "The minimum size of a cluster, otherwise it is seen as noise (this is a cheat, it is not mentioned in the paper).");
+
+    /**
+     * Parameter for the chi squared test threshold.
+     *
+     * While statistical values such as 0.01 are a good choice, we found the
+     * need to modify this parameter in our experiments.
+     */
+    protected double alpha;
+
+    /**
+     * Parameter for the poisson test threshold.
+     */
+    protected double poissonThreshold;
+
+    /**
+     * Maximum number of iterations for the EM step.
+     */
+    protected int maxEmIterations;
+
+    /**
+     * Threshold when to stop EM iterations.
+     */
+    protected double emDelta;
+
+    /**
+     * Minimum cluster size for noise flagging. (Not existant in the original
+     * publication).
+     */
+    protected int minClusterSize;
+
+    @Override
+    protected void makeOptions(Parameterization config) {
+      super.makeOptions(config);
+      // Significance level for the uniformity test, in (0, .5).
+      DoubleParameter alphaP = new DoubleParameter(ALPHA_THRESHOLD_ID, .001);
+      alphaP.addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE);
+      alphaP.addConstraint(CommonConstraints.LESS_THAN_HALF_DOUBLE);
+      if(config.grab(alphaP)) {
+        alpha = alphaP.getValue();
+      }
+      // Poisson test threshold for merging signatures, in (0, .5).
+      DoubleParameter poissonP = new DoubleParameter(POISSON_THRESHOLD_ID, 1.e-4);
+      poissonP.addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE);
+      poissonP.addConstraint(CommonConstraints.LESS_THAN_HALF_DOUBLE);
+      if(config.grab(poissonP)) {
+        poissonThreshold = poissonP.getValue();
+      }
+      // EM iteration limit; -1 means iterate until delta convergence.
+      IntParameter maxiterP = new IntParameter(MAX_EM_ITERATIONS_ID, 20);
+      maxiterP.addConstraint(CommonConstraints.GREATER_EQUAL_MINUSONE_INT);
+      if(config.grab(maxiterP)) {
+        maxEmIterations = maxiterP.getValue();
+      }
+      // EM convergence threshold, strictly positive.
+      DoubleParameter deltaP = new DoubleParameter(EM_DELTA_ID, 1.e-5);
+      deltaP.addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE);
+      if(config.grab(deltaP)) {
+        emDelta = deltaP.getValue();
+      }
+      // Minimum cluster size (extension over the original publication).
+      IntParameter minsizeP = new IntParameter(MIN_CLUSTER_SIZE_ID, 1);
+      minsizeP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
+      if(config.grab(minsizeP)) {
+        minClusterSize = minsizeP.getValue();
+      }
+    }
+
+    @Override
+    protected P3C<V> makeInstance() {
+      return new P3C<>(alpha, poissonThreshold, maxEmIterations, emDelta, minClusterSize);
+    }
+  }
+}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/PROCLUS.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/PROCLUS.java index 92158734..03e9978f 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/PROCLUS.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/PROCLUS.java @@ -67,7 +67,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Description; import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import de.lmu.ifi.dbs.elki.utilities.documentation.Title; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.RandomParameter; @@ -148,7 +148,7 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster public Clustering<SubspaceModel<V>> run(Database database, Relation<V> relation) { DistanceQuery<V, DoubleDistance> distFunc = this.getDistanceQuery(database); RangeQuery<V, DoubleDistance> rangeQuery = database.getRangeQuery(distFunc); - final Random random = rnd.getRandom(); + final Random random = rnd.getSingleThreadedRandom(); if (RelationUtil.dimensionality(relation) < l) { throw new IllegalStateException("Dimensionality of data < parameter l! 
" + "(" + RelationUtil.dimensionality(relation) + " < " + l + ")"); @@ -844,7 +844,7 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster configL(config); IntParameter m_iP = new IntParameter(M_I_ID, 10); - m_iP.addConstraint(new GreaterConstraint(0)); + m_iP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT); if (config.grab(m_iP)) { m_i = m_iP.getValue(); } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/SUBCLU.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/SUBCLU.java index c8d0833e..e6245f6e 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/SUBCLU.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/SUBCLU.java @@ -54,7 +54,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import de.lmu.ifi.dbs.elki.utilities.documentation.Title; import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DistanceParameter; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; @@ -77,7 +77,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; * @author Elke Achtert * * @apiviz.uses DBSCAN - * @apiviz.uses AbstractDimensionsSelectingDoubleDistanceFunction + * @apiviz.uses DimensionSelectingSubspaceDistanceFunction * @apiviz.has SubspaceModel * * @param <V> the type of FeatureVector handled by this Algorithm @@ -488,7 +488,7 @@ public class SUBCLU<V extends NumberVector<?>> extends AbstractAlgorithm<Cluster } IntParameter minptsP = new IntParameter(MINPTS_ID); - minptsP.addConstraint(new GreaterConstraint(0)); + 
minptsP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT); if (config.grab(minptsP)) { minpts = minptsP.getValue(); } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/ABOD.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/ABOD.java index ad0b8175..65447713 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/ABOD.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/ABOD.java @@ -23,59 +23,45 @@ package de.lmu.ifi.dbs.elki.algorithm.outlier; along with this program. If not, see <http://www.gnu.org/licenses/>. */ -import java.util.HashMap; - -import de.lmu.ifi.dbs.elki.algorithm.AbstractDistanceBasedAlgorithm; +import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm; import de.lmu.ifi.dbs.elki.data.NumberVector; import de.lmu.ifi.dbs.elki.data.type.TypeInformation; import de.lmu.ifi.dbs.elki.data.type.TypeUtil; -import de.lmu.ifi.dbs.elki.database.QueryUtil; +import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore; -import de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs; -import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs; -import de.lmu.ifi.dbs.elki.database.ids.DBID; import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; -import de.lmu.ifi.dbs.elki.database.ids.DBIDRange; import de.lmu.ifi.dbs.elki.database.ids.DBIDRef; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; -import de.lmu.ifi.dbs.elki.database.ids.DoubleDBIDPair; -import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; -import de.lmu.ifi.dbs.elki.database.ids.distance.KNNList; -import de.lmu.ifi.dbs.elki.database.query.knn.KNNQuery; +import de.lmu.ifi.dbs.elki.database.query.similarity.SimilarityQuery; import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation; import de.lmu.ifi.dbs.elki.database.relation.Relation; -import 
de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction; import de.lmu.ifi.dbs.elki.distance.distancevalue.DoubleDistance; -import de.lmu.ifi.dbs.elki.distance.similarityfunction.PrimitiveSimilarityFunction; +import de.lmu.ifi.dbs.elki.distance.similarityfunction.SimilarityFunction; import de.lmu.ifi.dbs.elki.distance.similarityfunction.kernel.KernelMatrix; import de.lmu.ifi.dbs.elki.distance.similarityfunction.kernel.PolynomialKernelFunction; import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.math.DoubleMinMax; import de.lmu.ifi.dbs.elki.math.MeanVariance; -import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; import de.lmu.ifi.dbs.elki.result.outlier.InvertedOutlierScoreMeta; import de.lmu.ifi.dbs.elki.result.outlier.OutlierResult; import de.lmu.ifi.dbs.elki.result.outlier.OutlierScoreMeta; -import de.lmu.ifi.dbs.elki.utilities.datastructures.heap.ComparableMaxHeap; -import de.lmu.ifi.dbs.elki.utilities.datastructures.heap.ComparableMinHeap; import de.lmu.ifi.dbs.elki.utilities.documentation.Description; import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import de.lmu.ifi.dbs.elki.utilities.documentation.Title; -import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; /** - * Angle-Based Outlier Detection + * Angle-Based Outlier Detection / Angle-Based Outlier Factor. * * Outlier detection using variance analysis on angles, especially for high - * dimensional data sets. + * dimensional data sets. 
Exact version, which has cubic runtime (see also + * {@link FastABOD} and {@link LBABOD} for faster versions). * * H.-P. Kriegel, M. Schubert, and A. Zimek: Angle-Based Outlier Detection in * High-dimensional Data. In: Proc. 14th ACM SIGKDD Int. Conf. on Knowledge @@ -84,475 +70,107 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; * @author Matthias Schubert (Original Code) * @author Erich Schubert (ELKIfication) * - * @apiviz.has KNNQuery - * * @param <V> Vector type */ @Title("ABOD: Angle-Based Outlier Detection") @Description("Outlier detection using variance analysis on angles, especially for high dimensional data sets.") @Reference(authors = "H.-P. Kriegel, M. Schubert, and A. Zimek", title = "Angle-Based Outlier Detection in High-dimensional Data", booktitle = "Proc. 14th ACM SIGKDD Int. Conf. on Knowledge Discovery and Data Mining (KDD '08), Las Vegas, NV, 2008", url = "http://dx.doi.org/10.1145/1401890.1401946") -public class ABOD<V extends NumberVector<?>> extends AbstractDistanceBasedAlgorithm<V, DoubleDistance, OutlierResult> implements OutlierAlgorithm { +public class ABOD<V extends NumberVector<?>> extends AbstractAlgorithm<OutlierResult> implements OutlierAlgorithm { /** * The logger for this class. */ private static final Logging LOG = Logging.getLogger(ABOD.class); /** - * Parameter for k, the number of neighbors used in kNN queries. - */ - public static final OptionID K_ID = new OptionID("abod.k", "Parameter k for kNN queries."); - - /** - * Parameter for sample size to be used in fast mode. - */ - public static final OptionID FAST_SAMPLE_ID = new OptionID("abod.samplesize", "Sample size to enable fast mode."); - - /** - * Parameter for the kernel function. - */ - public static final OptionID KERNEL_FUNCTION_ID = new OptionID("abod.kernelfunction", "Kernel function to use."); - - /** - * The preprocessor used to materialize the kNN neighborhoods. 
- */ - public static final OptionID PREPROCESSOR_ID = new OptionID("abod.knnquery", "Processor to compute the kNN neighborhoods."); - - /** - * use alternate code below. - */ - private static final boolean USE_RND_SAMPLE = false; - - /** - * k parameter. - */ - private int k; - - /** - * Variable to store fast mode sampling value. - */ - int sampleSize = 0; - - /** * Store the configured Kernel version. */ - private PrimitiveSimilarityFunction<? super V, DoubleDistance> primitiveKernelFunction; - - /** - * Static DBID map. - */ - private ArrayDBIDs staticids = null; + protected SimilarityFunction<? super V, DoubleDistance> kernelFunction; /** - * Actual constructor, with parameters. Fast mode (sampling). + * Constructor for Angle-Based Outlier Detection (ABOD). * - * @param k k parameter - * @param sampleSize sample size - * @param primitiveKernelFunction Kernel function to use - * @param distanceFunction Distance function + * @param kernelFunction kernel function to use */ - public ABOD(int k, int sampleSize, PrimitiveSimilarityFunction<? super V, DoubleDistance> primitiveKernelFunction, DistanceFunction<V, DoubleDistance> distanceFunction) { - super(distanceFunction); - this.k = k; - this.sampleSize = sampleSize; - this.primitiveKernelFunction = primitiveKernelFunction; + public ABOD(SimilarityFunction<? super V, DoubleDistance> kernelFunction) { + super(); + this.kernelFunction = kernelFunction; } /** - * Actual constructor, with parameters. Slow mode (exact). + * Run ABOD on the data set. * - * @param k k parameter - * @param primitiveKernelFunction kernel function to use - * @param distanceFunction Distance function + * @param relation Relation to process + * @return Outlier detection result */ - public ABOD(int k, PrimitiveSimilarityFunction<? 
super V, DoubleDistance> primitiveKernelFunction, DistanceFunction<V, DoubleDistance> distanceFunction) { - super(distanceFunction); - this.k = k; - this.sampleSize = 0; - this.primitiveKernelFunction = primitiveKernelFunction; - } + public OutlierResult run(Database db, Relation<V> relation) { + DBIDs ids = relation.getDBIDs(); + // Build a kernel matrix, to make O(n^3) slightly less bad. + SimilarityQuery<V, DoubleDistance> sq = db.getSimilarityQuery(relation, kernelFunction); + KernelMatrix kernelMatrix = new KernelMatrix(sq, relation, ids); - /** - * Main part of the algorithm. Exact version. - * - * @param relation Relation to query - * @return result - */ - public OutlierResult getRanking(Relation<V> relation) { - // Fix a static set of IDs - if (relation.getDBIDs() instanceof DBIDRange) { - staticids = (DBIDRange) relation.getDBIDs(); - } else { - staticids = DBIDUtil.newArray(relation.getDBIDs()); - ((ArrayModifiableDBIDs) staticids).sort(); - } - - KernelMatrix kernelMatrix = new KernelMatrix(primitiveKernelFunction, relation, staticids); - ComparableMaxHeap<DoubleDBIDPair> pq = new ComparableMaxHeap<>(relation.size()); - - // preprocess kNN neighborhoods - KNNQuery<V, DoubleDistance> knnQuery = QueryUtil.getKNNQuery(relation, getDistanceFunction(), k); + WritableDoubleDataStore abodvalues = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_STATIC); + DoubleMinMax minmaxabod = new DoubleMinMax(); MeanVariance s = new MeanVariance(); - for (DBIDIter objKey = relation.iterDBIDs(); objKey.valid(); objKey.advance()) { - s.reset(); - - KNNList<DoubleDistance> neighbors = knnQuery.getKNNForDBID(objKey, k); - for (DBIDIter key1 = neighbors.iter(); key1.valid(); key1.advance()) { - for (DBIDIter key2 = neighbors.iter(); key2.valid(); key2.advance()) { - if (DBIDUtil.equal(key2, key1) || DBIDUtil.equal(key1, objKey) || DBIDUtil.equal(key2, objKey)) { - continue; - } - double nenner = calcDenominator(kernelMatrix, objKey, key1, key2); - - if (nenner != 0) { 
- double sqrtnenner = Math.sqrt(nenner); - double tmp = calcNumerator(kernelMatrix, objKey, key1, key2) / nenner; - s.put(tmp, 1 / sqrtnenner); - } - - } - } - // Sample variance probably would be correct, however the numerical - // instabilities can actually break ABOD here. - pq.add(DBIDUtil.newPair(s.getNaiveVariance(), objKey)); + for (DBIDIter pA = ids.iter(); pA.valid(); pA.advance()) { + final double abof = computeABOF(relation, kernelMatrix, pA, s); + minmaxabod.put(abof); + abodvalues.putDouble(pA, abof); } - DoubleMinMax minmaxabod = new DoubleMinMax(); - WritableDoubleDataStore abodvalues = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC); - while (!pq.isEmpty()) { - DoubleDBIDPair pair = pq.poll(); - abodvalues.putDouble(pair, pair.doubleValue()); - minmaxabod.put(pair.doubleValue()); - } // Build result representation. - Relation<Double> scoreResult = new MaterializedRelation<>("Angle-based Outlier Degree", "abod-outlier", TypeUtil.DOUBLE, abodvalues, relation.getDBIDs()); + Relation<Double> scoreResult = new MaterializedRelation<>("Angle-Based Outlier Degree", "abod-outlier", TypeUtil.DOUBLE, abodvalues, relation.getDBIDs()); OutlierScoreMeta scoreMeta = new InvertedOutlierScoreMeta(minmaxabod.getMin(), minmaxabod.getMax(), 0.0, Double.POSITIVE_INFINITY); return new OutlierResult(scoreMeta, scoreResult); } /** - * Main part of the algorithm. Fast version. + * Compute the exact ABOF value. * - * @param relation Relation to use - * @return result - */ - public OutlierResult getFastRanking(Relation<V> relation) { - final DBIDs ids = relation.getDBIDs(); - // Fix a static set of IDs - // TODO: add a DBIDUtil.ensureSorted? 
- if (relation.getDBIDs() instanceof DBIDRange) { - staticids = (DBIDRange) relation.getDBIDs(); - } else { - staticids = DBIDUtil.newArray(relation.getDBIDs()); - ((ArrayModifiableDBIDs) staticids).sort(); - } - - KernelMatrix kernelMatrix = new KernelMatrix(primitiveKernelFunction, relation, staticids); - - ComparableMaxHeap<DoubleDBIDPair> pq = new ComparableMaxHeap<>(relation.size()); - // get Candidate Ranking - for (DBIDIter aKey = relation.iterDBIDs(); aKey.valid(); aKey.advance()) { - WritableDoubleDataStore dists = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT); - // determine kNearestNeighbors and pairwise distances - ComparableMinHeap<DoubleDBIDPair> nn; - if (!USE_RND_SAMPLE) { - nn = calcDistsandNN(relation, kernelMatrix, sampleSize, aKey, dists); - } else { - // alternative: - nn = calcDistsandRNDSample(relation, kernelMatrix, sampleSize, aKey, dists); - } - - // get normalization - double[] counter = calcFastNormalization(aKey, dists, staticids); - // umsetzen von Pq zu list - ModifiableDBIDs neighbors = DBIDUtil.newArray(nn.size()); - while (!nn.isEmpty()) { - neighbors.add(nn.poll()); - } - // getFilter - double var = getAbofFilter(kernelMatrix, aKey, dists, counter[1], counter[0], neighbors); - pq.add(DBIDUtil.newPair(var, aKey)); - } - // refine Candidates - ComparableMinHeap<DoubleDBIDPair> resqueue = new ComparableMinHeap<>(k); - MeanVariance s = new MeanVariance(); - while (!pq.isEmpty()) { - if (resqueue.size() == k && pq.peek().doubleValue() > resqueue.peek().doubleValue()) { - break; - } - // double approx = pq.peek().getFirst(); - DBIDRef aKey = pq.poll(); - s.reset(); - for (DBIDIter bKey = relation.iterDBIDs(); bKey.valid(); bKey.advance()) { - if (DBIDUtil.equal(bKey, aKey)) { - continue; - } - for (DBIDIter cKey = relation.iterDBIDs(); cKey.valid(); cKey.advance()) { - if (DBIDUtil.equal(cKey, aKey)) { - continue; - } - // double nenner = dists[y]*dists[z]; - double nenner = 
calcDenominator(kernelMatrix, aKey, bKey, cKey); - if (nenner != 0) { - double tmp = calcNumerator(kernelMatrix, aKey, bKey, cKey) / nenner; - double sqrtNenner = Math.sqrt(nenner); - s.put(tmp, 1 / sqrtNenner); - } - } - } - double var = s.getSampleVariance(); - if (resqueue.size() < k) { - resqueue.add(DBIDUtil.newPair(var, aKey)); - } else { - if (resqueue.peek().doubleValue() > var) { - resqueue.replaceTopElement(DBIDUtil.newPair(var, aKey)); - } - } - - } - DoubleMinMax minmaxabod = new DoubleMinMax(); - WritableDoubleDataStore abodvalues = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_STATIC); - while (!pq.isEmpty()) { - DoubleDBIDPair pair = pq.poll(); - abodvalues.putDouble(pair, pair.doubleValue()); - minmaxabod.put(pair.doubleValue()); - } - // Build result representation. - Relation<Double> scoreResult = new MaterializedRelation<>("Angle-based Outlier Detection", "abod-outlier", TypeUtil.DOUBLE, abodvalues, ids); - OutlierScoreMeta scoreMeta = new InvertedOutlierScoreMeta(minmaxabod.getMin(), minmaxabod.getMax(), 0.0, Double.POSITIVE_INFINITY); - return new OutlierResult(scoreMeta, scoreResult); - } - - private double[] calcFastNormalization(DBIDRef x, WritableDoubleDataStore dists, DBIDs ids) { - double[] result = new double[2]; - - double sum = 0; - double sumF = 0; - for (DBIDIter yKey = ids.iter(); yKey.valid(); yKey.advance()) { - if (dists.doubleValue(yKey) != 0) { - double tmp = 1 / Math.sqrt(dists.doubleValue(yKey)); - sum += tmp; - sumF += (1 / dists.doubleValue(yKey)) * tmp; - } - } - double sofar = 0; - double sofarF = 0; - for (DBIDIter zKey = ids.iter(); zKey.valid(); zKey.advance()) { - if (dists.doubleValue(zKey) != 0) { - double tmp = 1 / Math.sqrt(dists.doubleValue(zKey)); - sofar += tmp; - double rest = sum - sofar; - result[0] += tmp * rest; - - sofarF += (1 / dists.doubleValue(zKey)) * tmp; - double restF = sumF - sofarF; - result[1] += (1 / dists.doubleValue(zKey)) * tmp * restF; - } - } - return result; - } - - private 
double getAbofFilter(KernelMatrix kernelMatrix, DBIDRef aKey, WritableDoubleDataStore dists, double fulCounter, double counter, DBIDs neighbors) { - double sum = 0.0; - double sqrSum = 0.0; - double partCounter = 0; - for (DBIDIter bKey = neighbors.iter(); bKey.valid(); bKey.advance()) { - if (DBIDUtil.equal(bKey, aKey)) { + * @param relation Relation + * @param kernelMatrix Kernel matrix + * @param pA Object A to compute ABOF for + * @param s Statistics tracker + * @return ABOF value + */ + protected double computeABOF(Relation<V> relation, KernelMatrix kernelMatrix, DBIDRef pA, MeanVariance s) { + s.reset(); // Reused + double simAA = kernelMatrix.getSimilarity(pA, pA); + + for (DBIDIter nB = relation.iterDBIDs(); nB.valid(); nB.advance()) { + if (DBIDUtil.equal(nB, pA)) { continue; } - for (DBIDIter cKey = neighbors.iter(); cKey.valid(); cKey.advance()) { - if (DBIDUtil.equal(cKey, aKey)) { - continue; - } - if (DBIDUtil.compare(bKey, cKey) > 0) { - double nenner = dists.doubleValue(bKey) * dists.doubleValue(cKey); - if (nenner != 0) { - double tmp = calcNumerator(kernelMatrix, aKey, bKey, cKey) / nenner; - double sqrtNenner = Math.sqrt(nenner); - sum += tmp * (1 / sqrtNenner); - sqrSum += tmp * tmp * (1 / sqrtNenner); - partCounter += (1 / (sqrtNenner * nenner)); - } - } - } - } - // TODO: Document the meaning / use of fulCounter, partCounter. - double mu = (sum + (fulCounter - partCounter)) / counter; - return (sqrSum / counter) - (mu * mu); - } - - /** - * Compute the cosinus value between vectors aKey and bKey. - * - * @param kernelMatrix - * @param aKey - * @param bKey - * @return cosinus value - */ - private double calcCos(KernelMatrix kernelMatrix, DBIDRef aKey, DBIDRef bKey) { - final int ai = mapDBID(aKey); - final int bi = mapDBID(bKey); - return kernelMatrix.getDistance(ai, ai) + kernelMatrix.getDistance(bi, bi) - 2 * kernelMatrix.getDistance(ai, bi); - } - - private int mapDBID(DBIDRef aKey) { - // TODO: this is not the most efficient... 
- int off = staticids.binarySearch(aKey); - if (off < 0) { - throw new AbortException("Did not find id " + aKey.toString() + " in staticids. " + staticids.contains(aKey)); - } - return off + 1; - } - - private double calcDenominator(KernelMatrix kernelMatrix, DBIDRef aKey, DBIDRef bKey, DBIDRef cKey) { - return calcCos(kernelMatrix, aKey, bKey) * calcCos(kernelMatrix, aKey, cKey); - } - - private double calcNumerator(KernelMatrix kernelMatrix, DBIDRef aKey, DBIDRef bKey, DBIDRef cKey) { - final int ai = mapDBID(aKey); - final int bi = mapDBID(bKey); - final int ci = mapDBID(cKey); - return (kernelMatrix.getDistance(ai, ai) + kernelMatrix.getDistance(bi, ci) - kernelMatrix.getDistance(ai, ci) - kernelMatrix.getDistance(ai, bi)); - } - - private ComparableMinHeap<DoubleDBIDPair> calcDistsandNN(Relation<V> data, KernelMatrix kernelMatrix, int sampleSize, DBIDRef aKey, WritableDoubleDataStore dists) { - ComparableMinHeap<DoubleDBIDPair> nn = new ComparableMinHeap<>(sampleSize); - for (DBIDIter bKey = data.iterDBIDs(); bKey.valid(); bKey.advance()) { - double val = calcCos(kernelMatrix, aKey, bKey); - dists.putDouble(bKey, val); - if (nn.size() < sampleSize) { - nn.add(DBIDUtil.newPair(val, bKey)); - } else { - if (val < nn.peek().doubleValue()) { - nn.replaceTopElement(DBIDUtil.newPair(val, bKey)); - } - } - } - return nn; - } - - private ComparableMinHeap<DoubleDBIDPair> calcDistsandRNDSample(Relation<V> data, KernelMatrix kernelMatrix, int sampleSize, DBIDRef aKey, WritableDoubleDataStore dists) { - ComparableMinHeap<DoubleDBIDPair> nn = new ComparableMinHeap<>(sampleSize); - int step = (int) ((double) data.size() / (double) sampleSize); - int counter = 0; - for (DBIDIter bKey = data.iterDBIDs(); bKey.valid(); bKey.advance()) { - double val = calcCos(kernelMatrix, aKey, bKey); - dists.putDouble(bKey, val); - if (counter % step == 0) { - nn.add(DBIDUtil.newPair(val, bKey)); + double simBB = kernelMatrix.getSimilarity(nB, nB); + double simAB = 
kernelMatrix.getSimilarity(pA, nB); + double sqdAB = simAA + simBB - simAB - simAB; + if (!(sqdAB > 0.)) { + continue; } - counter++; - } - return nn; - } - - /** - * Get explanations for points in the database. - * - * @param data to get explanations for - * @return String explanation - */ - // TODO: this should be done by the result classes. - public String getExplanations(Relation<V> data) { - KernelMatrix kernelMatrix = new KernelMatrix(primitiveKernelFunction, data, staticids); - // PQ for Outlier Ranking - ComparableMaxHeap<DoubleDBIDPair> pq = new ComparableMaxHeap<>(data.size()); - HashMap<DBID, DBIDs> explaintab = new HashMap<>(); - // test all objects - MeanVariance s = new MeanVariance(), s2 = new MeanVariance(); - for (DBIDIter objKey = data.iterDBIDs(); objKey.valid(); objKey.advance()) { - s.reset(); - // Queue for the best explanation - ComparableMinHeap<DoubleDBIDPair> explain = new ComparableMinHeap<>(); - // determine Object - // for each pair of other objects - for (DBIDIter key1 = data.iterDBIDs(); key1.valid(); key1.advance()) { - // Collect Explanation Vectors - s2.reset(); - if (DBIDUtil.equal(objKey, key1)) { + for (DBIDIter nC = relation.iterDBIDs(); nC.valid(); nC.advance()) { + if (DBIDUtil.equal(nC, pA) || DBIDUtil.compare(nC, nB) < 0) { continue; } - for (DBIDIter key2 = data.iterDBIDs(); key2.valid(); key2.advance()) { - if (DBIDUtil.equal(key2, key1) || DBIDUtil.equal(objKey, key2)) { - continue; - } - double nenner = calcDenominator(kernelMatrix, objKey, key1, key2); - if (nenner != 0) { - double tmp = calcNumerator(kernelMatrix, objKey, key1, key2) / nenner; - double sqr = Math.sqrt(nenner); - s2.put(tmp, 1 / sqr); - } - } - explain.add(DBIDUtil.newPair(s2.getSampleVariance(), key1)); - s.put(s2); - } - // build variance of the observed vectors - pq.add(DBIDUtil.newPair(s.getSampleVariance(), objKey)); - // - ModifiableDBIDs expList = DBIDUtil.newArray(); - expList.add(explain.poll()); - while (!explain.isEmpty()) { - DBIDRef 
nextKey = explain.poll(); - if (DBIDUtil.equal(nextKey, objKey)) { + double simCC = kernelMatrix.getSimilarity(nC, nC); + double simAC = kernelMatrix.getSimilarity(pA, nC); + double sqdAC = simAA + simCC - simAC; + if (!(sqdAC > 0.)) { continue; } - double max = Double.MIN_VALUE; - for (DBIDIter exp = expList.iter(); exp.valid(); exp.advance()) { - if (DBIDUtil.equal(exp, objKey) || DBIDUtil.equal(nextKey, exp)) { - continue; - } - double nenner = Math.sqrt(calcCos(kernelMatrix, objKey, nextKey)) * Math.sqrt(calcCos(kernelMatrix, objKey, exp)); - double angle = calcNumerator(kernelMatrix, objKey, nextKey, exp) / nenner; - max = Math.max(angle, max); - } - if (max < 0.5) { - expList.add(nextKey); - } - } - explaintab.put(DBIDUtil.deref(objKey), expList); - } - StringBuilder buf = new StringBuilder(); - buf.append("Result: ABOD\n"); - int count = 0; - while (!pq.isEmpty()) { - if (count > 10) { - break; + // Exploit bilinearity of scalar product: + // <B-A, C-A> = <B, C-A> - <A,C-A> + // = <B,C> - <B,A> - <A,C> + <A,A> + // For computing variance, AA is a constant and can be ignored. + double simBC = kernelMatrix.getSimilarity(nB, nC); + double numerator = simBC - simAB - simAC; // + simAA; + double val = numerator / (sqdAB * sqdAC); + s.put(val, 1. 
/ Math.sqrt(sqdAB * sqdAC)); } - double factor = pq.peek().doubleValue(); - DBIDRef key = pq.poll(); - buf.append(data.get(key)).append(' '); - buf.append(count).append(" Factor=").append(factor).append(' ').append(key).append('\n'); - DBIDs expList = explaintab.get(key); - generateExplanation(buf, data, key, expList); - count++; - } - return buf.toString(); - } - - private void generateExplanation(StringBuilder buf, Relation<V> data, DBIDRef key, DBIDs expList) { - Vector vect1 = data.get(key).getColumnVector(); - for (DBIDIter iter = expList.iter(); iter.valid(); iter.advance()) { - buf.append("Outlier: ").append(vect1).append('\n'); - Vector exp = data.get(iter).getColumnVector(); - buf.append("Most common neighbor: ").append(exp).append('\n'); - // determine difference Vector - Vector vals = exp.minus(vect1); - buf.append(vals).append('\n'); - } - } - - /** - * Run ABOD on the data set. - * - * @param relation Relation to process - * @return Outlier detection result - */ - public OutlierResult run(Relation<V> relation) { - if (sampleSize > 0) { - return getFastRanking(relation); - } else { - return getRanking(relation); } + // Sample variance probably would be correct, but the ABOD publication + // uses the naive variance. + final double abof = s.getNaiveVariance(); + return abof; } @Override @@ -572,45 +190,29 @@ public class ABOD<V extends NumberVector<?>> extends AbstractDistanceBasedAlgori * * @apiviz.exclude */ - public static class Parameterizer<V extends NumberVector<?>> extends AbstractDistanceBasedAlgorithm.Parameterizer<V, DoubleDistance> { - /** - * k Parameter. - */ - protected int k = 0; - + public static class Parameterizer<V extends NumberVector<?>> extends AbstractParameterizer { /** - * Sample size. + * Parameter for the kernel function. */ - protected int sampleSize = 0; + public static final OptionID KERNEL_FUNCTION_ID = new OptionID("abod.kernelfunction", "Kernel function to use."); /** * Distance function. 
*/ - protected PrimitiveSimilarityFunction<V, DoubleDistance> primitiveKernelFunction = null; + protected SimilarityFunction<V, DoubleDistance> kernelFunction = null; @Override protected void makeOptions(Parameterization config) { super.makeOptions(config); - final IntParameter kP = new IntParameter(K_ID, 30); - kP.addConstraint(new GreaterEqualConstraint(1)); - if (config.grab(kP)) { - k = kP.getValue(); - } - final IntParameter sampleSizeP = new IntParameter(FAST_SAMPLE_ID); - sampleSizeP.addConstraint(new GreaterEqualConstraint(1)); - sampleSizeP.setOptional(true); - if (config.grab(sampleSizeP)) { - sampleSize = sampleSizeP.getValue(); - } - final ObjectParameter<PrimitiveSimilarityFunction<V, DoubleDistance>> param = new ObjectParameter<>(KERNEL_FUNCTION_ID, PrimitiveSimilarityFunction.class, PolynomialKernelFunction.class); + final ObjectParameter<SimilarityFunction<V, DoubleDistance>> param = new ObjectParameter<>(KERNEL_FUNCTION_ID, SimilarityFunction.class, PolynomialKernelFunction.class); if (config.grab(param)) { - primitiveKernelFunction = param.instantiateClass(config); + kernelFunction = param.instantiateClass(config); } } @Override protected ABOD<V> makeInstance() { - return new ABOD<>(k, sampleSize, primitiveKernelFunction, distanceFunction); + return new ABOD<>(kernelFunction); } } } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/AbstractAggarwalYuOutlier.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/AbstractAggarwalYuOutlier.java index 99356aef..2b12b306 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/AbstractAggarwalYuOutlier.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/AbstractAggarwalYuOutlier.java @@ -38,11 +38,12 @@ import de.lmu.ifi.dbs.elki.database.ids.DoubleDBIDPair; import de.lmu.ifi.dbs.elki.database.ids.HashSetModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.database.relation.RelationUtil; +import de.lmu.ifi.dbs.elki.math.MathUtil; import 
de.lmu.ifi.dbs.elki.result.outlier.OutlierResult; import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; import de.lmu.ifi.dbs.elki.utilities.pairs.IntIntPair; @@ -161,7 +162,7 @@ public abstract class AbstractAggarwalYuOutlier<V extends NumberVector<?>> exten protected static double sparsity(final int setsize, final int dbsize, final int k, final double phi) { // calculate sparsity c final double f = 1. / phi; - final double fK = Math.pow(f, k); + final double fK = MathUtil.powi(f, k); final double sC = (setsize - (dbsize * fK)) / Math.sqrt(dbsize * fK * (1 - fK)); return sC; } @@ -242,12 +243,12 @@ public abstract class AbstractAggarwalYuOutlier<V extends NumberVector<?>> exten protected void makeOptions(Parameterization config) { super.makeOptions(config); final IntParameter kP = new IntParameter(K_ID); - kP.addConstraint(new GreaterEqualConstraint(2)); + kP.addConstraint(CommonConstraints.GREATER_THAN_ONE_INT); if(config.grab(kP)) { k = kP.getValue(); } final IntParameter phiP = new IntParameter(PHI_ID); - phiP.addConstraint(new GreaterEqualConstraint(2)); + phiP.addConstraint(CommonConstraints.GREATER_THAN_ONE_INT); if(config.grab(phiP)) { phi = phiP.getValue(); } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/AggarwalYuEvolutionary.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/AggarwalYuEvolutionary.java index 89be0e66..c4e5cc5d 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/AggarwalYuEvolutionary.java +++ 
b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/AggarwalYuEvolutionary.java @@ -56,7 +56,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.RandomParameter;
@@ -132,24 +132,24 @@ public class AggarwalYuEvolutionary<V extends NumberVector<?>> extends AbstractA final int dbsize = relation.size();
ArrayList<ArrayList<DBIDs>> ranges = buildRanges(relation);
- Heap<Individuum>.UnorderedIter individuums = (new EvolutionarySearch(relation, ranges, m, rnd.getRandom())).run();
+ Heap<Individuum>.UnorderedIter individuums = (new EvolutionarySearch(relation, ranges, m, rnd.getSingleThreadedRandom())).run();
WritableDoubleDataStore outlierScore = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_STATIC);
- for (; individuums.valid(); individuums.advance()) {
+ for(; individuums.valid(); individuums.advance()) {
DBIDs ids = computeSubspaceForGene(individuums.get().getGene(), ranges);
double sparsityC = sparsity(ids.size(), dbsize, k, phi);
- for (DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) {
+ for(DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) {
double prev = outlierScore.doubleValue(iter);
- if (Double.isNaN(prev) || sparsityC < prev) {
+ if(Double.isNaN(prev) || sparsityC < prev) {
outlierScore.putDouble(iter, sparsityC);
}
}
}
DoubleMinMax minmax = new DoubleMinMax();
- for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
+ for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
double val = outlierScore.doubleValue(iditer);
- if (Double.isNaN(val)) {
+ if(Double.isNaN(val)) {
outlierScore.putDouble(iditer, 0.0);
val = 0.0;
}
@@ -219,12 +219,12 @@ public class AggarwalYuEvolutionary<V extends NumberVector<?>> extends AbstractA ArrayList<Individuum> pop = initialPopulation(m);
// best Population
TopBoundedHeap<Individuum> bestSol = new TopBoundedHeap<>(m, Collections.reverseOrder());
- for (Individuum ind : pop) {
+ for(Individuum ind : pop) {
bestSol.add(ind);
}
int iterations = 0;
- while (!checkConvergence(pop)) {
+ while(!checkConvergence(pop)) {
Collections.sort(pop);
pop = rouletteRankSelection(pop);
// Crossover
@@ -232,28 +232,28 @@ public class AggarwalYuEvolutionary<V extends NumberVector<?>> extends AbstractA // Mutation with probability 0.25 , 0.25
pop = mutation(pop, 0.5, 0.5);
// Avoid duplicates
- ind: for (Individuum ind : pop) {
- for (Heap<Individuum>.UnorderedIter it = bestSol.unorderedIter(); it.valid(); it.advance()) {
- if (it.get().equals(ind)) {
+ ind: for(Individuum ind : pop) {
+ for(Heap<Individuum>.UnorderedIter it = bestSol.unorderedIter(); it.valid(); it.advance()) {
+ if(it.get().equals(ind)) {
continue ind;
}
}
bestSol.add(ind);
}
- if (LOG.isDebuggingFinest()) {
+ if(LOG.isDebuggingFinest()) {
StringBuilder buf = new StringBuilder();
buf.append("Top solutions:\n");
- for (Heap<Individuum>.UnorderedIter it = bestSol.unorderedIter(); it.valid(); it.advance()) {
+ for(Heap<Individuum>.UnorderedIter it = bestSol.unorderedIter(); it.valid(); it.advance()) {
buf.append(it.get().toString()).append('\n');
}
buf.append("Population:\n");
- for (Individuum ind : pop) {
+ for(Individuum ind : pop) {
buf.append(ind.toString()).append('\n');
}
LOG.debugFinest(buf.toString());
}
iterations++;
- if (iterations > MAX_ITERATIONS) {
+ if(iterations > MAX_ITERATIONS) {
LOG.warning("Maximum iterations reached.");
break;
}
@@ -268,18 +268,18 @@ public class AggarwalYuEvolutionary<V extends NumberVector<?>> extends AbstractA * @return Convergence
*/
private boolean checkConvergence(Collection<Individuum> pop) {
- if (pop.size() == 0) {
+ if(pop.size() == 0) {
return true;
}
// Gene occurrence counter
int[][] occur = new int[dim][phi + 1];
// Count gene occurrences
- for (Individuum ind : pop) {
+ for(Individuum ind : pop) {
int[] gene = ind.getGene();
- for (int d = 0; d < dim; d++) {
+ for(int d = 0; d < dim; d++) {
int val = gene[d] + DONT_CARE;
- if (val < 0 || val >= phi + 1) {
+ if(val < 0 || val >= phi + 1) {
LOG.warning("Invalid gene value encountered: " + val + " in " + ind.toString());
continue;
}
@@ -288,20 +288,20 @@ public class AggarwalYuEvolutionary<V extends NumberVector<?>> extends AbstractA }
int conv = (int) (pop.size() * 0.95);
- if (LOG.isDebuggingFine()) {
+ if(LOG.isDebuggingFine()) {
LOG.debugFine("Convergence at " + conv + " of " + pop.size() + " individuums.");
}
- for (int d = 0; d < dim; d++) {
+ for(int d = 0; d < dim; d++) {
boolean converged = false;
- for (int val = 0; val < phi + 1; val++) {
- if (occur[d][val] >= conv) {
+ for(int val = 0; val < phi + 1; val++) {
+ if(occur[d][val] >= conv) {
converged = true;
break;
}
}
// A single failure to converge is sufficient to continue.
- if (!converged) {
+ if(!converged) {
return false;
}
}
@@ -318,19 +318,19 @@ public class AggarwalYuEvolutionary<V extends NumberVector<?>> extends AbstractA // Initial Population
ArrayList<Individuum> population = new ArrayList<>(popsize);
// fill population
- for (int i = 0; i < popsize; i++) {
+ for(int i = 0; i < popsize; i++) {
// Random Individual
int[] gene = new int[dim];
// fill don't care ( any dimension == don't care)
- for (int j = 0; j < dim; j++) {
+ for(int j = 0; j < dim; j++) {
gene[j] = DONT_CARE;
}
// count of don't care positions
int countDim = k;
// fill non don't care positions of the Individual
- while (countDim > 0) {
+ while(countDim > 0) {
int z = random.nextInt(dim);
- if (gene[z] == DONT_CARE) {
+ if(gene[z] == DONT_CARE) {
gene[z] = random.nextInt(phi) + 1;
countDim--;
}
@@ -361,20 +361,21 @@ public class AggarwalYuEvolutionary<V extends NumberVector<?>> extends AbstractA ArrayList<Individuum> survivors = new ArrayList<>(popsize);
// position of selection
- for (int i = 0; i < popsize; i++) {
+ for(int i = 0; i < popsize; i++) {
int z = random.nextInt(totalweight);
- for (int j = 0; j < popsize; j++) {
- if (z < popsize - j) {
+ for(int j = 0; j < popsize; j++) {
+ if(z < popsize - j) {
// TODO: need clone?
survivors.add(population.get(j));
break;
- } else {
+ }
+ else {
// decrement
z -= (popsize - j);
}
}
}
- if (survivors.size() != popsize) {
+ if(survivors.size() != popsize) {
throw new AbortException("Selection step failed - implementation error?");
}
// Don't sort, to avoid biasing the crossover!
@@ -394,23 +395,24 @@ public class AggarwalYuEvolutionary<V extends NumberVector<?>> extends AbstractA TreeSet<Integer> R = new TreeSet<>();
// for each individuum
- for (int j = 0; j < population.size(); j++) {
+ for(int j = 0; j < population.size(); j++) {
// clear the Sets
Q.clear();
R.clear();
// Fill the Sets with the Positions
- for (int i = 0; i < dim; i++) {
- if (population.get(j).getGene()[i] == DONT_CARE) {
+ for(int i = 0; i < dim; i++) {
+ if(population.get(j).getGene()[i] == DONT_CARE) {
Q.add(i);
- } else {
+ }
+ else {
R.add(i);
}
}
//
double r1 = random.nextDouble();
- if (Q.size() != 0) {
+ if(Q.size() != 0) {
// Mutation Variant 1
- if (r1 <= perc1) {
+ if(r1 <= perc1) {
// calc Mutation Spot
Integer[] pos = new Integer[Q.size()];
pos = Q.toArray(pos);
@@ -435,7 +437,7 @@ public class AggarwalYuEvolutionary<V extends NumberVector<?>> extends AbstractA }
r1 = random.nextDouble();
// Mutation Variant 2
- if (r1 <= perc2) {
+ if(r1 <= perc2) {
// calc Mutation Spot
Integer[] pos = new Integer[R.size()];
pos = R.toArray(pos);
@@ -471,14 +473,14 @@ public class AggarwalYuEvolutionary<V extends NumberVector<?>> extends AbstractA // Crossover Set of population Set
ArrayList<Individuum> crossover = new ArrayList<>();
- for (int i = 0; i < population.size() - 1; i += 2) {
+ for(int i = 0; i < population.size() - 1; i += 2) {
Pair<Individuum, Individuum> recombine = recombineOptimized(population.get(i), population.get(i + 1));
// add the Solutions to the new Set
crossover.add(recombine.getFirst());
crossover.add(recombine.getSecond());
}
// if the set contains an odd number of Subspaces, retain the last one
- if (population.size() % 2 == 1) {
+ if(population.size() % 2 == 1) {
crossover.add(population.get(population.size() - 1));
}
// Collections.sort(crossover);
@@ -499,14 +501,14 @@ public class AggarwalYuEvolutionary<V extends NumberVector<?>> extends AbstractA // Set of Positions in which neither s1 or s2 is don't care
ArrayList<Integer> R = new ArrayList<>(dim);
- for (int i = 0; i < dim; i++) {
- if ((parent1.getGene()[i] == DONT_CARE) && (parent2.getGene()[i] != DONT_CARE)) {
+ for(int i = 0; i < dim; i++) {
+ if((parent1.getGene()[i] == DONT_CARE) && (parent2.getGene()[i] != DONT_CARE)) {
Q.add(i);
}
- if ((parent1.getGene()[i] != DONT_CARE) && (parent2.getGene()[i] == DONT_CARE)) {
+ if((parent1.getGene()[i] != DONT_CARE) && (parent2.getGene()[i] == DONT_CARE)) {
Q.add(i);
}
- if ((parent1.getGene()[i] != DONT_CARE) && (parent2.getGene()[i] != DONT_CARE)) {
+ if((parent1.getGene()[i] != DONT_CARE) && (parent2.getGene()[i] != DONT_CARE)) {
R.add(i);
}
}
@@ -518,11 +520,11 @@ public class AggarwalYuEvolutionary<V extends NumberVector<?>> extends AbstractA int count = k - R.size();
Iterator<Integer> q = Q.iterator();
- while (count > 0) {
+ while(count > 0) {
int[] l1 = b.clone();
int[] l2 = b.clone();
- while (q.hasNext()) {
+ while(q.hasNext()) {
int next = q.next();
// pos = next;
@@ -536,14 +538,15 @@ public class AggarwalYuEvolutionary<V extends NumberVector<?>> extends AbstractA final double sparsityL1 = sparsity(computeSubspaceForGene(l1, ranges).size(), dbsize, k, phi);
final double sparsityL2 = sparsity(computeSubspaceForGene(l2, ranges).size(), dbsize, k, phi);
- if (sparsityL1 <= sparsityL2) {
+ if(sparsityL1 <= sparsityL2) {
b = l1.clone();
- if (s1Null) {
+ if(s1Null) {
count--;
}
- } else {
+ }
+ else {
b = l2.clone();
- if (s2Null) {
+ if(s2Null) {
count--;
}
}
@@ -555,10 +558,11 @@ public class AggarwalYuEvolutionary<V extends NumberVector<?>> extends AbstractA // create the complementary String
int[] comp = new int[dim];
- for (int i = 0; i < dim; i++) {
- if (b[i] == parent1.getGene()[i]) {
+ for(int i = 0; i < dim; i++) {
+ if(b[i] == parent1.getGene()[i]) {
comp[i] = parent2.getGene()[i];
- } else {
+ }
+ else {
comp[i] = parent2.getGene()[i];
}
}
@@ -581,7 +585,7 @@ public class AggarwalYuEvolutionary<V extends NumberVector<?>> extends AbstractA * @return best gene combination
*/
private Individuum combineRecursive(ArrayList<Integer> r, int i, int[] current, Individuum parent1, Individuum parent2) {
- if (i == r.size()) {
+ if(i == r.size()) {
return makeIndividuum(current);
}
// Position to modify
@@ -594,9 +598,10 @@ public class AggarwalYuEvolutionary<V extends NumberVector<?>> extends AbstractA Individuum i1 = combineRecursive(r, i + 1, gene1, parent1, parent2);
Individuum i2 = combineRecursive(r, i + 1, gene2, parent1, parent2);
// Return the better result.
- if (i1.getFitness() < i2.getFitness()) {
+ if(i1.getFitness() < i2.getFitness()) {
return i1;
- } else {
+ }
+ else {
return i2;
}
}
@@ -657,15 +662,15 @@ public class AggarwalYuEvolutionary<V extends NumberVector<?>> extends AbstractA @Override
public boolean equals(Object obj) {
- if (!(obj instanceof Individuum)) {
+ if(!(obj instanceof Individuum)) {
return false;
}
Individuum other = (Individuum) obj;
- if (other.second.length != this.second.length) {
+ if(other.second.length != this.second.length) {
return false;
}
- for (int i = 0; i < this.second.length; i++) {
- if (other.second[i] != this.second[i]) {
+ for(int i = 0; i < this.second.length; i++) {
+ if(other.second[i] != this.second[i]) {
return false;
}
}
@@ -703,12 +708,12 @@ public class AggarwalYuEvolutionary<V extends NumberVector<?>> extends AbstractA protected void makeOptions(Parameterization config) {
super.makeOptions(config);
final IntParameter mP = new IntParameter(M_ID);
- mP.addConstraint(new GreaterEqualConstraint(2));
- if (config.grab(mP)) {
+ mP.addConstraint(CommonConstraints.GREATER_THAN_ONE_INT);
+ if(config.grab(mP)) {
m = mP.getValue();
}
final RandomParameter rndP = new RandomParameter(SEED_ID);
- if (config.grab(rndP)) {
+ if(config.grab(rndP)) {
rnd = rndP.getValue();
}
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/COP.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/COP.java index 06168c5a..190d14fe 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/COP.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/COP.java @@ -62,11 +62,12 @@ import de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.NumberArrayAdapter import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import de.lmu.ifi.dbs.elki.utilities.documentation.Title; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.LessConstraint; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.EnumParameter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.Flag; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; @@ -194,6 +195,11 @@ public class COP<V extends NumberVector<?>, D extends NumberDistance<D, ?>> exte DistanceDist dist = DistanceDist.CHISQUARED; /** + * Include models in output. + */ + boolean models; + + /** * Constructor. * * @param distanceFunction distance function @@ -201,13 +207,15 @@ public class COP<V extends NumberVector<?>, D extends NumberDistance<D, ?>> exte * @param pca PCA computation method * @param expect Expected fraction of outliers (for score normalization) * @param dist Distance distribution model (ChiSquared, Gamma) + * @param models Report models */ - public COP(DistanceFunction<? 
super V, D> distanceFunction, int k, PCARunner<V> pca, double expect, DistanceDist dist) { + public COP(DistanceFunction<? super V, D> distanceFunction, int k, PCARunner<V> pca, double expect, DistanceDist dist, boolean models) { super(distanceFunction); this.k = k; this.pca = pca; this.expect = expect; this.dist = dist; + this.models = models; } /** @@ -221,22 +229,26 @@ public class COP<V extends NumberVector<?>, D extends NumberDistance<D, ?>> exte KNNQuery<V, D> knnQuery = QueryUtil.getKNNQuery(relation, getDistanceFunction(), k + 1); final int dim = RelationUtil.dimensionality(relation); - if (k <= dim + 1) { + if(k <= dim + 1) { LOG.warning("PCA is underspecified with a too low k! k should be at much larger than " + dim); } WritableDoubleDataStore cop_score = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_STATIC); - WritableDataStore<Vector> cop_err_v = DataStoreUtil.makeStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_STATIC, Vector.class); - WritableIntegerDataStore cop_dim = DataStoreUtil.makeIntegerStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_STATIC, -1); + WritableDataStore<Vector> cop_err_v = null; + WritableIntegerDataStore cop_dim = null; + if(models) { + cop_err_v = DataStoreUtil.makeStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_STATIC, Vector.class); + cop_dim = DataStoreUtil.makeIntegerStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_STATIC, -1); + } // compute neighbors of each db object FiniteProgress prog = LOG.isVerbose() ? 
new FiniteProgress("Correlation Outlier Probabilities", relation.size(), LOG) : null; - for (DBIDIter id = ids.iter(); id.valid(); id.advance()) { + for(DBIDIter id = ids.iter(); id.valid(); id.advance()) { KNNList<D> neighbors = knnQuery.getKNNForDBID(id, k + 1); ModifiableDBIDs nids = DBIDUtil.newHashSet(neighbors); nids.remove(id); // Do not use query object - Vector centroid = Centroid.make(relation, nids).toVector(relation).getColumnVector(); + Vector centroid = Centroid.make(relation, nids); Vector relative = relation.get(id).getColumnVector().minusEquals(centroid); PCAResult pcares = pca.processIds(nids, relation); @@ -246,17 +258,17 @@ public class COP<V extends NumberVector<?>, D extends NumberDistance<D, ?>> exte double min = Double.POSITIVE_INFINITY; int vdim = dim; - switch(dist) { + switch(dist){ case CHISQUARED: { double sqdevs = 0; - for (int d = 0; d < dim; d++) { + for(int d = 0; d < dim; d++) { // Scale with Stddev double dev = projected.get(d); // Accumulate sqdevs += dev * dev / evs[d]; // Evaluate double score = 1 - ChiSquaredDistribution.cdf(sqdevs, d + 1); - if (score < min) { + if(score < min) { min = score; vdim = d + 1; } @@ -267,21 +279,21 @@ public class COP<V extends NumberVector<?>, D extends NumberDistance<D, ?>> exte double[][] dists = new double[dim][nids.size()]; int j = 0; Vector srel = new Vector(dim); - for (DBIDIter s = nids.iter(); s.valid() && j < nids.size(); s.advance()) { + for(DBIDIter s = nids.iter(); s.valid() && j < nids.size(); s.advance()) { V vec = relation.get(s); - for (int d = 0; d < dim; d++) { + for(int d = 0; d < dim; d++) { srel.set(d, vec.doubleValue(d) - centroid.get(d)); } Vector serr = evecs.transposeTimes(srel); double sqdist = 0.0; - for (int d = 0; d < dim; d++) { + for(int d = 0; d < dim; d++) { sqdist += serr.get(d) * serr.get(d) / evs[d]; dists[d][j] = sqdist; } j++; } double sqdevs = 0; - for (int d = 0; d < dim; d++) { + for(int d = 0; d < dim; d++) { // Scale with Stddev final double dev = 
projected.get(d); // Accumulate @@ -290,7 +302,7 @@ public class COP<V extends NumberVector<?>, D extends NumberDistance<D, ?>> exte Arrays.sort(dists[d]); // Evaluate double score = 1 - GammaChoiWetteEstimator.STATIC.estimate(dists[d], SHORTENED_ARRAY).cdf(sqdevs); - if (score < min) { + if(score < min) { min = score; vdim = d + 1; } @@ -301,20 +313,22 @@ public class COP<V extends NumberVector<?>, D extends NumberDistance<D, ?>> exte // Normalize the value final double prob = expect * (1 - min) / (expect + min); // Construct the error vector: - for (int d = vdim; d < dim; d++) { + for(int d = vdim; d < dim; d++) { projected.set(d, 0.0); } Vector ev = evecs.times(projected).timesEquals(-1 * prob); cop_score.putDouble(id, prob); - cop_err_v.put(id, ev); - cop_dim.putInt(id, dim + 1 - vdim); + if(models) { + cop_err_v.put(id, ev); + cop_dim.putInt(id, dim + 1 - vdim); + } - if (prog != null) { + if(prog != null) { prog.incrementProcessed(LOG); } } - if (prog != null) { + if(prog != null) { prog.ensureCompleted(LOG); } @@ -322,8 +336,10 @@ public class COP<V extends NumberVector<?>, D extends NumberDistance<D, ?>> exte Relation<Double> scoreResult = new MaterializedRelation<>("Correlation Outlier Probabilities", COP_SCORES, TypeUtil.DOUBLE, cop_score, ids); OutlierScoreMeta scoreMeta = new ProbabilisticOutlierScore(); OutlierResult result = new OutlierResult(scoreMeta, scoreResult); - result.addChildResult(new MaterializedRelation<>("Local Dimensionality", COP_DIM, TypeUtil.INTEGER, cop_dim, ids)); - result.addChildResult(new MaterializedRelation<>("Error vectors", COP_ERRORVEC, TypeUtil.VECTOR, cop_err_v, ids)); + if(models) { + result.addChildResult(new MaterializedRelation<>("Local Dimensionality", COP_DIM, TypeUtil.INTEGER, cop_dim, ids)); + result.addChildResult(new MaterializedRelation<>("Error vectors", COP_ERRORVEC, TypeUtil.VECTOR, cop_err_v, ids)); + } return result; } @@ -382,6 +398,16 @@ public class COP<V extends NumberVector<?>, D extends 
NumberDistance<D, ?>> exte public static final OptionID EXPECT_ID = new OptionID("cop.expect", "Expected share of outliers. Only affect score normalization."); /** + * Include COP error vectors in output. + * <p> + * Key: {@code -cop.models} + * + * Default: off + * </p> + */ + public static final OptionID MODELS_ID = new OptionID("cop.models", "Include COP models (error vectors) in output. This needs more memory."); + + /** * Number of neighbors to be considered. */ int k; @@ -401,33 +427,42 @@ public class COP<V extends NumberVector<?>, D extends NumberDistance<D, ?>> exte */ double expect; + /** + * Include COP models + */ + boolean models = false; + @Override protected void makeOptions(Parameterization config) { super.makeOptions(config); IntParameter kP = new IntParameter(K_ID); kP.addConstraint(new GreaterConstraint(5)); - if (config.grab(kP)) { + if(config.grab(kP)) { k = kP.intValue(); } EnumParameter<DistanceDist> distP = new EnumParameter<>(DIST_ID, DistanceDist.class, DistanceDist.GAMMA); - if (config.grab(distP)) { + if(config.grab(distP)) { dist = distP.getValue(); } DoubleParameter expectP = new DoubleParameter(EXPECT_ID, 0.001); - expectP.addConstraint(new GreaterConstraint(0)); - expectP.addConstraint(new LessConstraint(1.0)); - if (config.grab(expectP)) { + expectP.addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE); + expectP.addConstraint(CommonConstraints.LESS_THAN_ONE_DOUBLE); + if(config.grab(expectP)) { expect = expectP.doubleValue(); } ObjectParameter<PCARunner<V>> pcaP = new ObjectParameter<>(PCARUNNER_ID, PCARunner.class, PCARunner.class); - if (config.grab(pcaP)) { + if(config.grab(pcaP)) { pca = pcaP.instantiateClass(config); } + Flag modelsF = new Flag(MODELS_ID); + if(config.grab(modelsF)) { + models = modelsF.isTrue(); + } } @Override protected COP<V, D> makeInstance() { - return new COP<>(distanceFunction, k, pca, expect, dist); + return new COP<>(distanceFunction, k, pca, expect, dist, models); } } } diff --git 
a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/DWOF.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/DWOF.java new file mode 100644 index 00000000..ef782390 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/DWOF.java @@ -0,0 +1,407 @@ +package de.lmu.ifi.dbs.elki.algorithm.outlier;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2013
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import de.lmu.ifi.dbs.elki.algorithm.AbstractDistanceBasedAlgorithm;
+import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
+import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
+import de.lmu.ifi.dbs.elki.database.Database;
+import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory;
+import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil;
+import de.lmu.ifi.dbs.elki.database.datastore.WritableDataStore;
+import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore;
+import de.lmu.ifi.dbs.elki.database.datastore.WritableIntegerDataStore;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.distance.DistanceDBIDList;
+import de.lmu.ifi.dbs.elki.database.ids.distance.DistanceDBIDListIter;
+import de.lmu.ifi.dbs.elki.database.ids.distance.KNNList;
+import de.lmu.ifi.dbs.elki.database.query.DatabaseQuery;
+import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery;
+import de.lmu.ifi.dbs.elki.database.query.knn.KNNQuery;
+import de.lmu.ifi.dbs.elki.database.query.range.RangeQuery;
+import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction;
+import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance;
+import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress;
+import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress;
+import de.lmu.ifi.dbs.elki.logging.progress.StepProgress;
+import de.lmu.ifi.dbs.elki.math.DoubleMinMax;
+import de.lmu.ifi.dbs.elki.math.Mean;
+import de.lmu.ifi.dbs.elki.result.outlier.InvertedOutlierScoreMeta;
+import de.lmu.ifi.dbs.elki.result.outlier.OutlierResult;
+import de.lmu.ifi.dbs.elki.result.outlier.OutlierScoreMeta;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
+
+/**
+ * <p>
+ * Algorithm to compute dynamic-window outlier factors in a database based on a
+ * specified parameter {@link Parameterizer#K_ID} ({@code -dwof.k}).
+ * </p>
+ *
+ * <p>
+ * The parameter {@link Parameterizer#K_ID} specifies the number of
+ * neighbors to be considered during the calculation of the DWOF score.
+ * </p>
+ *
+ * <p>
+ * All the distance queries -KNN and Range- are determined using the parameter
+ * {@link AbstractDistanceBasedAlgorithm#DISTANCE_FUNCTION_ID}
+ * </p>
+ *
+ * <p>
+ * Reference: <br>
+ * Rana Momtaz, Nesma Mohssen and Mohammad A. Gowayyed: DWOF: A Robust
+ * Density-Based Outlier Detection Approach. <br>
+ * In: Pattern Recognition and Image Analysis , Proc. 6th Iberian Conference,
+ * IbPRIA 2013, Funchal, Madeira, Portugal, June 5-7, 2013.
+ * </p>
+ *
+ * @author Omar Yousry
+ *
+ * @param <O> the type of DatabaseObjects handled by this Algorithm
+ * @param <D> Distance type
+ */
+
+@Title("DWOF: Dynamic Window Outlier Factor")
+@Description("Algorithm to compute dynamic-window outlier factors in a database based on the neighborhood size parameter 'k'")
+@Reference(authors = "R. Momtaz, N. Mohssen, M. A. Gowayyed", title = "DWOF: A Robust Density-Based OutlierDetection Approach", booktitle = "Pattern Recognition and Image Analysis, Proc. 6th Iberian Conference, IbPRIA 2013, Funchal, Madeira, Portugal, 2013.", url = "http://dx.doi.org/10.1007%2F978-3-642-38628-2_61")
+public class DWOF<O, D extends NumberDistance<D, ?>> extends AbstractDistanceBasedAlgorithm<O, D, OutlierResult> implements OutlierAlgorithm {
+ /**
+ * The logger for this class.
+ */
+ private static final Logging LOG = Logging.getLogger(DWOF.class);
+
+ /**
+ * Holds the value of {@link Parameterizer#K_ID} plus one (the query point
+ * itself is included), i.e. the neighborhood size used for DWOF scores.
+ */
+ protected int k;
+
+ /**
+ * Radius increase factor: all radii are multiplied by this value in each iteration.
+ */
+ private double delta = 1.1;
+
+ /**
+ * Constructor.
+ *
+ * @param distanceFunction Distance function to use in queries
+ * @param k the value of k
+ * @param delta Radius increase factor
+ */
+ public DWOF(DistanceFunction<? super O, D> distanceFunction, int k, double delta) {
+ super(distanceFunction);
+ this.k = k + 1;
+ this.delta = delta;
+ }
+
+ /**
+ * Performs the Generalized DWOF_SCORE algorithm on the given database by
+ * calling all the other methods in the proper order.
+ *
+ * @param database Database to query
+ * @param relation Data to process
+ * @return new OutlierResult instance
+ */
+ public OutlierResult run(Database database, Relation<O> relation) {
+ final DBIDs ids = relation.getDBIDs();
+ DistanceQuery<O, D> distFunc = database.getDistanceQuery(relation, getDistanceFunction());
+ // Get k nearest neighbor and range query on the relation.
+ KNNQuery<O, D> knnq = database.getKNNQuery(distFunc, k, DatabaseQuery.HINT_HEAVY_USE);
+ RangeQuery<O, D> rnnQuery = database.getRangeQuery(distFunc, DatabaseQuery.HINT_HEAVY_USE);
+
+ StepProgress stepProg = LOG.isVerbose() ? new StepProgress("DWOF", 2) : null;
+ // DWOF output score storage.
+ WritableDoubleDataStore dwofs = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_DB | DataStoreFactory.HINT_HOT, 0.);
+ if(stepProg != null) {
+ stepProg.beginStep(1, "Initializing objects' Radii", LOG);
+ }
+ WritableDoubleDataStore radii = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, 0.);
+ // Find an initial radius for each object:
+ initializeRadii(ids, knnq, distFunc, radii);
+ WritableIntegerDataStore oldSizes = DataStoreUtil.makeIntegerStorage(ids, DataStoreFactory.HINT_HOT, 1);
+ WritableIntegerDataStore newSizes = DataStoreUtil.makeIntegerStorage(ids, DataStoreFactory.HINT_HOT, 1);
+ int countUnmerged = relation.size();
+ if(stepProg != null) {
+ stepProg.beginStep(2, "Clustering-Evaluating Cycles.", LOG);
+ }
+ IndefiniteProgress clusEvalProgress = LOG.isVerbose() ? new IndefiniteProgress("Evaluating DWOFs", LOG) : null;
+ while(countUnmerged > 0) {
+ if(clusEvalProgress != null) {
+ clusEvalProgress.incrementProcessed(LOG);
+ }
+ // Increase radii
+ for(DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) {
+ radii.putDouble(iter, radii.doubleValue(iter) * delta);
+ }
+ // stores the clustering label for each object
+ WritableDataStore<ModifiableDBIDs> labels = DataStoreUtil.makeStorage(ids, DataStoreFactory.HINT_TEMP, ModifiableDBIDs.class);
+ // Cluster objects based on the current radius
+ clusterData(ids, rnnQuery, radii, labels);
+ // simple reference swap
+ WritableIntegerDataStore temp = newSizes;
+ newSizes = oldSizes;
+ oldSizes = temp;
+
+ // Update the cluster size count for each object.
+ countUnmerged = updateSizes(ids, labels, newSizes);
+ labels.destroy();
+ // Update DWOF scores.
+ for(DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) {
+ double newScore = (newSizes.intValue(iter) > 0) ? ((double) (oldSizes.intValue(iter) - 1) / (double) newSizes.intValue(iter)) : 0.0;
+ dwofs.putDouble(iter, dwofs.doubleValue(iter) + newScore);
+ }
+ }
+ if(clusEvalProgress != null) {
+ clusEvalProgress.setCompleted(LOG);
+ }
+ if(stepProg != null) {
+ stepProg.setCompleted(LOG);
+ }
+ // Build result representation.
+ DoubleMinMax minmax = new DoubleMinMax();
+ for(DBIDIter iter = relation.iterDBIDs(); iter.valid(); iter.advance()) {
+ minmax.put(dwofs.doubleValue(iter));
+ }
+ OutlierScoreMeta meta = new InvertedOutlierScoreMeta(minmax.getMin(), minmax.getMax(), 0.0, Double.POSITIVE_INFINITY);
+ Relation<Double> rel = new MaterializedRelation<>("Dynamic-Window Outlier Factors", "dwof-outlier", TypeUtil.DOUBLE, dwofs, ids);
+ return new OutlierResult(meta, rel);
+ }
+
+ /**
+ * This method prepares a container for the radii of the objects and
+ * initializes radii according to the equation:
+ *
+ * initialRadii of a certain object = (absoluteMinDist of all objects) *
+ * (avgDist of the object) / (minAvgDist of all objects)
+ *
+ * @param ids Database IDs to process
+ * @param distFunc Distance function
+ * @param knnq kNN search function
+ * @param radii WritableDoubleDataStore to store radii
+ */
+ private void initializeRadii(DBIDs ids, KNNQuery<O, D> knnq, DistanceQuery<O, D> distFunc, WritableDoubleDataStore radii) {
+ FiniteProgress avgDistProgress = LOG.isVerbose() ? new FiniteProgress("Calculating average kNN distances-", ids.size(), LOG) : null;
+ double absoluteMinDist = Double.POSITIVE_INFINITY;
+ double minAvgDist = Double.POSITIVE_INFINITY;
+ // to get the mean for each object
+ Mean mean = new Mean();
+ // Iterate over all objects
+ for(DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) {
+ KNNList<D> iterNeighbors = knnq.getKNNForDBID(iter, k);
+ // skip the point itself
+ mean.reset();
+ for(DBIDIter neighbor1 = iterNeighbors.iter(); neighbor1.valid(); neighbor1.advance()) {
+ if(DBIDUtil.equal(neighbor1, iter)) {
+ continue;
+ }
+ for(DBIDIter neighbor2 = iterNeighbors.iter(); neighbor2.valid(); neighbor2.advance()) {
+ if(DBIDUtil.equal(neighbor1, neighbor2) || DBIDUtil.equal(neighbor2, iter)) {
+ continue;
+ }
+ double distance = distFunc.distance(neighbor1, neighbor2).doubleValue();
+ mean.put(distance);
+ if(distance > 0. && distance < absoluteMinDist) {
+ absoluteMinDist = distance;
+ }
+ }
+ }
+ double currentMean = mean.getMean();
+ radii.putDouble(iter, currentMean);
+ if(currentMean < minAvgDist) {
+ minAvgDist = currentMean;
+ }
+ if(avgDistProgress != null) {
+ avgDistProgress.incrementProcessed(LOG);
+ }
+ }
+ if(avgDistProgress != null) {
+ avgDistProgress.ensureCompleted(LOG);
+ }
+
+ // Initializing the radii of all objects.
+ for(DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) {
+ radii.putDouble(iter, (minAvgDist > 0) ? (absoluteMinDist * radii.doubleValue(iter) / minAvgDist) : Double.POSITIVE_INFINITY);
+ }
+ }
+
+ /**
+ * This method applies a density based clustering algorithm.
+ *
+ * It looks for an unclustered object and builds a new cluster for it, then
+ * adds all the points within its radius to that cluster.
+ *
+ * nChain represents the points to be added to the cluster and its
+ * radius-neighbors
+ *
+ * @param ids Database IDs to process
+ * @param rnnQuery Data to process
+ * @param radii Radii to cluster accordingly
+ * @param labels Label storage.
+ */
+ private void clusterData(DBIDs ids, RangeQuery<O, D> rnnQuery, WritableDoubleDataStore radii, WritableDataStore<ModifiableDBIDs> labels) {
+ FiniteProgress clustProg = LOG.isVerbose() ? new FiniteProgress("Density-Based Clustering", ids.size(), LOG) : null;
+ // Iterate over all objects
+ for(DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) {
+ if(labels.get(iter) != null) {
+ continue;
+ }
+ ModifiableDBIDs newCluster = DBIDUtil.newArray();
+ newCluster.add(iter);
+ labels.put(iter, newCluster);
+ if(clustProg != null) {
+ clustProg.incrementProcessed(LOG);
+ }
+ // container of the points to be added and their radii neighbors to the
+ // cluster
+ ModifiableDBIDs nChain = DBIDUtil.newArray();
+ nChain.add(iter);
+ // iterate over nChain
+ for(DBIDIter toGetNeighbors = nChain.iter(); toGetNeighbors.valid(); toGetNeighbors.advance()) {
+ D range = rnnQuery.getDistanceFactory().fromDouble(radii.doubleValue(toGetNeighbors));
+ DistanceDBIDList<D> nNeighbors = rnnQuery.getRangeForDBID(toGetNeighbors, range);
+ for(DistanceDBIDListIter<D> iter2 = nNeighbors.iter(); iter2.valid(); iter2.advance()) {
+ if(DBIDUtil.equal(toGetNeighbors, iter2)) {
+ continue;
+ }
+ if(labels.get(iter2) == null) {
+ newCluster.add(iter2);
+ labels.put(iter2, newCluster);
+ nChain.add(iter2);
+ if(clustProg != null) {
+ clustProg.incrementProcessed(LOG);
+ }
+ }
+ else if(labels.get(iter2) != newCluster) {
+ ModifiableDBIDs toBeDeleted = labels.get(iter2);
+ newCluster.addDBIDs(toBeDeleted);
+ for(DBIDIter iter3 = toBeDeleted.iter(); iter3.valid(); iter3.advance()) {
+ labels.put(iter3, newCluster);
+ }
+ toBeDeleted.clear();
+ }
+ }
+ }
+ }
+ if(clustProg != null) {
+ clustProg.ensureCompleted(LOG);
+ }
+ }
+
+ /**
+ * This method updates each object's cluster size after the clustering step.
+ *
+ * @param ids Object IDs to process
+ * @param labels references for each object's cluster
+ * @param newSizes the sizes container to be updated
+ * @return the number of unclustered objects
+ */
+ private int updateSizes(DBIDs ids, WritableDataStore<ModifiableDBIDs> labels, WritableIntegerDataStore newSizes) {
+ // to count the unclustered all over
+ int countUnmerged = 0;
+ for(DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) {
+ // checking the point's new cluster size after the clustering step
+ int newClusterSize = labels.get(iter).size();
+ newSizes.putInt(iter, newClusterSize);
+ // the point is alone in the cluster --> not merged with other points
+ if(newClusterSize == 1) {
+ countUnmerged++;
+ }
+ }
+ return countUnmerged;
+ }
+
+ @Override
+ public TypeInformation[] getInputTypeRestriction() {
+ return TypeUtil.array(getDistanceFunction().getInputTypeRestriction());
+ }
+
+ @Override
+ protected Logging getLogger() {
+ return LOG;
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Omar Yousry
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer<O, D extends NumberDistance<D, ?>> extends AbstractDistanceBasedAlgorithm.Parameterizer<O, D> {
+ /**
+ * Option ID for the number of neighbors.
+ */
+ public static final OptionID K_ID = OptionID.getOrCreateOptionID("dwof.k", "Number of neighbors to get for DWOF score outlier detection.");
+
+ /**
+ * Option ID for radius increases
+ */
+ public static final OptionID DELTA_ID = OptionID.getOrCreateOptionID("dwof.delta", "Radius increase factor.");
+
+ /**
+ * Number of neighbors to get
+ */
+ protected int k = 2;
+
+ /**
+ * Radius increase factor.
+ */
+ protected double delta = 1.1;
+
+ @Override
+ protected void makeOptions(Parameterization config) {
+ // The super class has the distance function parameter!
+ super.makeOptions(config);
+ IntParameter kP = new IntParameter(K_ID);
+ kP.addConstraint(CommonConstraints.GREATER_THAN_ONE_INT);
+ if(config.grab(kP)) {
+ k = kP.getValue();
+ }
+ DoubleParameter deltaP = new DoubleParameter(DELTA_ID);
+ deltaP.setDefaultValue(1.1);
+ deltaP.addConstraint(CommonConstraints.GREATER_THAN_ONE_DOUBLE);
+ if(config.grab(deltaP)) {
+ delta = deltaP.getValue();
+ }
+ }
+
+ @Override
+ protected DWOF<O, D> makeInstance() {
+ return new DWOF<>(distanceFunction, k, delta);
+ }
+ }
+}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/EMOutlier.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/EMOutlier.java index f8fd686f..76191cf2 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/EMOutlier.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/EMOutlier.java @@ -38,10 +38,12 @@ import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.result.Result;
import de.lmu.ifi.dbs.elki.result.outlier.OutlierResult;
import de.lmu.ifi.dbs.elki.result.outlier.OutlierScoreMeta;
import de.lmu.ifi.dbs.elki.result.outlier.ProbabilisticOutlierScore;
import de.lmu.ifi.dbs.elki.utilities.ClassGenericsUtil;
+import de.lmu.ifi.dbs.elki.utilities.datastructures.hierarchy.Hierarchy.Iter;
import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
@@ -91,15 +93,27 @@ public class EMOutlier<V extends NumberVector<?>> extends AbstractAlgorithm<Outl * @return Outlier result
*/
public OutlierResult run(Database database, Relation<V> relation) {
+ emClustering.setSoft(true);
Clustering<EMModel<V>> emresult = emClustering.run(database, relation);
+ Relation<double[]> soft = null;
+ for (Iter<Result> iter = emresult.getHierarchy().iterChildren(emresult); iter.valid(); iter.advance()) {
+ if (!(iter.get() instanceof Relation)) {
+ continue;
+ }
+ if (((Relation<?>) iter.get()).getDataTypeInformation() == EM.SOFT_TYPE) {
+ @SuppressWarnings("unchecked")
+ Relation<double[]> rel = (Relation<double[]>) iter.get();
+ soft = rel;
+ }
+ }
double globmax = 0.0;
WritableDoubleDataStore emo_score = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT);
- for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
double maxProb = Double.POSITIVE_INFINITY;
- double[] probs = emClustering.getProbClusterIGivenX(iditer);
- for(double prob : probs) {
- maxProb = Math.min(1 - prob, maxProb);
+ double[] probs = soft.get(iditer);
+ for (double prob : probs) {
+ maxProb = Math.min(1. - prob, maxProb);
}
emo_score.putDouble(iditer, maxProb);
globmax = Math.max(maxProb, globmax);
@@ -145,4 +159,4 @@ public class EMOutlier<V extends NumberVector<?>> extends AbstractAlgorithm<Outl return new EMOutlier<>(em);
}
}
-}
\ No newline at end of file +}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/FastABOD.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/FastABOD.java new file mode 100644 index 00000000..ee6bd434 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/FastABOD.java @@ -0,0 +1,219 @@ +package de.lmu.ifi.dbs.elki.algorithm.outlier; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2013 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.data.type.TypeInformation; +import de.lmu.ifi.dbs.elki.data.type.TypeUtil; +import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; +import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; +import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; +import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.ids.DBIDs; +import de.lmu.ifi.dbs.elki.database.ids.DoubleDBIDPair; +import de.lmu.ifi.dbs.elki.database.query.similarity.SimilarityQuery; +import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.distance.distancevalue.DoubleDistance; +import de.lmu.ifi.dbs.elki.distance.similarityfunction.SimilarityFunction; +import de.lmu.ifi.dbs.elki.distance.similarityfunction.kernel.KernelMatrix; +import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.math.DoubleMinMax; +import de.lmu.ifi.dbs.elki.math.MeanVariance; +import de.lmu.ifi.dbs.elki.result.outlier.InvertedOutlierScoreMeta; +import de.lmu.ifi.dbs.elki.result.outlier.OutlierResult; +import de.lmu.ifi.dbs.elki.result.outlier.OutlierScoreMeta; +import de.lmu.ifi.dbs.elki.utilities.datastructures.heap.ComparableMaxHeap; +import de.lmu.ifi.dbs.elki.utilities.datastructures.heap.ObjectHeap; +import de.lmu.ifi.dbs.elki.utilities.documentation.Description; +import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; +import de.lmu.ifi.dbs.elki.utilities.documentation.Title; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; + +/** + * Angle-Based Outlier Detection / Angle-Based Outlier Factor. 
+ * + * Fast-ABOD (approximateABOF) version. + * + * H.-P. Kriegel, M. Schubert, and A. Zimek: Angle-Based Outlier Detection in + * High-dimensional Data. In: Proc. 14th ACM SIGKDD Int. Conf. on Knowledge + * Discovery and Data Mining (KDD '08), Las Vegas, NV, 2008. + * + * @author Matthias Schubert (Original Code) + * @author Erich Schubert (ELKIfication) + * + * @param <V> Vector type + */ +@Title("Approximate ABOD: Angle-Based Outlier Detection") +@Description("Outlier detection using variance analysis on angles, especially for high dimensional data sets.") +@Reference(authors = "H.-P. Kriegel, M. Schubert, and A. Zimek", title = "Angle-Based Outlier Detection in High-dimensional Data", booktitle = "Proc. 14th ACM SIGKDD Int. Conf. on Knowledge Discovery and Data Mining (KDD '08), Las Vegas, NV, 2008", url = "http://dx.doi.org/10.1145/1401890.1401946") +public class FastABOD<V extends NumberVector<?>> extends ABOD<V> { + /** + * The logger for this class. + */ + private static final Logging LOG = Logging.getLogger(FastABOD.class); + + /** + * Number of nearest neighbors. + */ + protected int k; + + /** + * Constructor for Angle-Based Outlier Detection (ABOD). + * + * @param kernelFunction kernel function to use + * @param k Number of nearest neighbors + */ + public FastABOD(SimilarityFunction<? super V, DoubleDistance> kernelFunction, int k) { + super(kernelFunction); + this.k = k; + } + + /** + * Run Fast-ABOD on the data set. + * + * @param relation Relation to process + * @return Outlier detection result + */ + @Override + public OutlierResult run(Database db, Relation<V> relation) { + DBIDs ids = relation.getDBIDs(); + // Build a kernel matrix, to make O(n^3) slightly less bad. 
+ SimilarityQuery<V, DoubleDistance> sq = db.getSimilarityQuery(relation, kernelFunction); + KernelMatrix kernelMatrix = new KernelMatrix(sq, relation, ids); + + WritableDoubleDataStore abodvalues = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_STATIC); + DoubleMinMax minmaxabod = new DoubleMinMax(); + + MeanVariance s = new MeanVariance(); + for (DBIDIter pA = ids.iter(); pA.valid(); pA.advance()) { + s.reset(); + final double simAA = kernelMatrix.getSimilarity(pA, pA); + + // Choose the k-min nearest + ComparableMaxHeap<DoubleDBIDPair> nn = new ComparableMaxHeap<>(k); + for (DBIDIter nB = relation.iterDBIDs(); nB.valid(); nB.advance()) { + if (DBIDUtil.equal(nB, pA)) { + continue; + } + double simBB = kernelMatrix.getSimilarity(nB, nB); + double simAB = kernelMatrix.getSimilarity(pA, nB); + double sqdAB = simAA + simBB - simAB - simAB; + if (!(sqdAB > 0.)) { + continue; + } + if (nn.size() < k) { + nn.add(DBIDUtil.newPair(sqdAB, nB)); + } else if (sqdAB < nn.peek().doubleValue()) { + nn.replaceTopElement(DBIDUtil.newPair(sqdAB, nB)); + } + } + + for (ObjectHeap.UnsortedIter<DoubleDBIDPair> iB = nn.unsortedIter(); iB.valid(); iB.advance()) { + DoubleDBIDPair nB = iB.get(); + double sqdAB = nB.doubleValue(); + double simAB = kernelMatrix.getSimilarity(pA, nB); + if (!(sqdAB > 0.)) { + continue; + } + for (ObjectHeap.UnsortedIter<DoubleDBIDPair> iC = nn.unsortedIter(); iC.valid(); iC.advance()) { + DoubleDBIDPair nC = iC.get(); + if (DBIDUtil.compare(nC, nB) < 0) { + continue; + } + double sqdAC = nC.doubleValue(); + double simAC = kernelMatrix.getSimilarity(pA, nC); + if (!(sqdAC > 0.)) { + continue; + } + // Exploit bilinearity of scalar product: + // <B-A, C-A> = <B, C-A> - <A,C-A> + // = <B,C> - <B,A> - <A,C> + <A,A> + // For computing variance, AA is a constant and can be ignored. + double simBC = kernelMatrix.getSimilarity(nB, nC); + double numerator = simBC - simAB - simAC; // + simAA; + double val = numerator / (sqdAB * sqdAC); + s.put(val, 1. 
/ Math.sqrt(sqdAB * sqdAC)); + } + } + // Sample variance probably would be correct, but the ABOD publication + // uses the naive variance. + final double abof = s.getNaiveVariance(); + minmaxabod.put(abof); + abodvalues.putDouble(pA, abof); + } + + // Build result representation. + Relation<Double> scoreResult = new MaterializedRelation<>("Angle-Based Outlier Degree", "abod-outlier", TypeUtil.DOUBLE, abodvalues, relation.getDBIDs()); + OutlierScoreMeta scoreMeta = new InvertedOutlierScoreMeta(minmaxabod.getMin(), minmaxabod.getMax(), 0.0, Double.POSITIVE_INFINITY); + return new OutlierResult(scoreMeta, scoreResult); + } + + @Override + public TypeInformation[] getInputTypeRestriction() { + return TypeUtil.array(TypeUtil.NUMBER_VECTOR_FIELD); + } + + @Override + protected Logging getLogger() { + return LOG; + } + + /** + * Parameterization class. + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer<V extends NumberVector<?>> extends ABOD.Parameterizer<V> { + /** + * Parameter for the nearest neighbors. + */ + public static final OptionID K_ID = new OptionID("fastabod.k", "Number of nearest neighbors to use for ABOD."); + + /** + * Number of neighbors. 
+ */ + protected int k; + + @Override + protected void makeOptions(Parameterization config) { + super.makeOptions(config); + final IntParameter kP = new IntParameter(K_ID); + if (config.grab(kP)) { + k = kP.intValue(); + } + } + + @Override + protected FastABOD<V> makeInstance() { + return new FastABOD<>(kernelFunction, k); + } + } +} diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/GaussianModel.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/GaussianModel.java index c9e6a634..3f8bb484 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/GaussianModel.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/GaussianModel.java @@ -112,7 +112,7 @@ public class GaussianModel<V extends NumberVector<?>> extends AbstractAlgorithm< Matrix covarianceTransposed = covarianceMatrix.cheatToAvoidSingularity(SINGULARITY_CHEAT).inverse();
// Normalization factors for Gaussian PDF
- final double fakt = (1.0 / (Math.sqrt(Math.pow(MathUtil.TWOPI, RelationUtil.dimensionality(relation)) * covarianceMatrix.det())));
+ final double fakt = (1.0 / (Math.sqrt(MathUtil.powi(MathUtil.TWOPI, RelationUtil.dimensionality(relation)) * covarianceMatrix.det())));
// for each object compute Mahalanobis distance
for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/GaussianUniformMixture.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/GaussianUniformMixture.java index 294592e8..e6659a8f 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/GaussianUniformMixture.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/GaussianUniformMixture.java @@ -219,7 +219,7 @@ public class GaussianUniformMixture<V extends NumberVector<?>> extends AbstractA Matrix covInv = covarianceMatrix.cheatToAvoidSingularity(SINGULARITY_CHEAT).inverse();
double covarianceDet = covarianceMatrix.det();
- double fakt = 1.0 / Math.sqrt(Math.pow(MathUtil.TWOPI, RelationUtil.dimensionality(database)) * covarianceDet);
+ double fakt = 1.0 / Math.sqrt(MathUtil.powi(MathUtil.TWOPI, RelationUtil.dimensionality(database)) * covarianceDet);
// for each object compute probability and sum
double prob = 0;
for (DBIDIter iter = objids.iter(); iter.valid(); iter.advance()) {
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/LBABOD.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/LBABOD.java new file mode 100644 index 00000000..37b4d050 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/LBABOD.java @@ -0,0 +1,288 @@ +package de.lmu.ifi.dbs.elki.algorithm.outlier; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2013 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.data.type.TypeInformation; +import de.lmu.ifi.dbs.elki.data.type.TypeUtil; +import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; +import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; +import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; +import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.ids.DBIDs; +import de.lmu.ifi.dbs.elki.database.ids.DoubleDBIDPair; +import de.lmu.ifi.dbs.elki.database.query.similarity.SimilarityQuery; +import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.distance.distancevalue.DoubleDistance; +import de.lmu.ifi.dbs.elki.distance.similarityfunction.SimilarityFunction; +import de.lmu.ifi.dbs.elki.distance.similarityfunction.kernel.KernelMatrix; +import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.logging.Logging.Level; +import de.lmu.ifi.dbs.elki.logging.LoggingConfiguration; +import de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic; +import de.lmu.ifi.dbs.elki.math.DoubleMinMax; +import de.lmu.ifi.dbs.elki.math.MeanVariance; +import de.lmu.ifi.dbs.elki.result.outlier.InvertedOutlierScoreMeta; +import de.lmu.ifi.dbs.elki.result.outlier.OutlierResult; +import de.lmu.ifi.dbs.elki.result.outlier.OutlierScoreMeta; +import de.lmu.ifi.dbs.elki.utilities.datastructures.heap.ComparableMaxHeap; +import de.lmu.ifi.dbs.elki.utilities.datastructures.heap.ComparableMinHeap; +import de.lmu.ifi.dbs.elki.utilities.datastructures.heap.DoubleMinHeap; +import de.lmu.ifi.dbs.elki.utilities.datastructures.heap.ObjectHeap; +import de.lmu.ifi.dbs.elki.utilities.documentation.Description; +import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; +import 
de.lmu.ifi.dbs.elki.utilities.documentation.Title; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; + +/** + * Angle-Based Outlier Detection / Angle-Based Outlier Factor. + * + * LB-ABOD (lower-bound) version. Exact on the top k outliers, approximate on + * the remaining. + * + * Outlier detection using variance analysis on angles, especially for high + * dimensional data sets. + * + * H.-P. Kriegel, M. Schubert, and A. Zimek: Angle-Based Outlier Detection in + * High-dimensional Data. In: Proc. 14th ACM SIGKDD Int. Conf. on Knowledge + * Discovery and Data Mining (KDD '08), Las Vegas, NV, 2008. + * + * @author Matthias Schubert (Original Code) + * @author Erich Schubert (ELKIfication) + * + * @param <V> Vector type + */ +@Title("LB-ABOD: Lower Bounded Angle-Based Outlier Detection") +@Description("Outlier detection using variance analysis on angles, especially for high dimensional data sets.") +@Reference(authors = "H.-P. Kriegel, M. Schubert, and A. Zimek", title = "Angle-Based Outlier Detection in High-dimensional Data", booktitle = "Proc. 14th ACM SIGKDD Int. Conf. on Knowledge Discovery and Data Mining (KDD '08), Las Vegas, NV, 2008", url = "http://dx.doi.org/10.1145/1401890.1401946") +public class LBABOD<V extends NumberVector<?>> extends FastABOD<V> { + /** + * The logger for this class. + */ + private static final Logging LOG = Logging.getLogger(LBABOD.class); + + /** + * Number of outliers to refine. + */ + protected int l; + + /** + * Actual constructor, with parameters. Fast mode (sampling). + * + * @param kernelFunction Kernel function to use + * @param k k parameter + * @param l Number of outliers to find exact + */ + public LBABOD(SimilarityFunction<? 
super V, DoubleDistance> kernelFunction, int k, int l) { + super(kernelFunction, k); + this.l = l; + } + + /** + * Run LB-ABOD on the data set. + * + * @param relation Relation to process + * @return Outlier detection result + */ + @Override + public OutlierResult run(Database db, Relation<V> relation) { + DBIDs ids = relation.getDBIDs(); + SimilarityQuery<V, DoubleDistance> sq = relation.getDatabase().getSimilarityQuery(relation, kernelFunction); + KernelMatrix kernelMatrix = new KernelMatrix(sq, relation, ids); + + // Output storage. + WritableDoubleDataStore abodvalues = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_STATIC); + DoubleMinMax minmaxabod = new DoubleMinMax(); + double max = 0.; + + // Storage for squared distances (will be reused!) + WritableDoubleDataStore sqDists = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT); + // Nearest neighbor heap (will be reused!) + ComparableMaxHeap<DoubleDBIDPair> nn = new ComparableMaxHeap<>(k); + + // Priority queue for candidates + ComparableMinHeap<DoubleDBIDPair> candidates = new ComparableMinHeap<>(relation.size()); + // get Candidate Ranking + for(DBIDIter pA = relation.iterDBIDs(); pA.valid(); pA.advance()) { + // Compute nearest neighbors and distances. + nn.clear(); + double simAA = kernelMatrix.getSimilarity(pA, pA); + // Sum of 1./(|AB|) and 1./(|AB|^2); for computing R2. + double sumid = 0., sumisqd = 0.; + for(DBIDIter nB = relation.iterDBIDs(); nB.valid(); nB.advance()) { + if(DBIDUtil.equal(nB, pA)) { + continue; + } + double simBB = kernelMatrix.getSimilarity(nB, nB); + double simAB = kernelMatrix.getSimilarity(pA, nB); + double sqdAB = simAA + simBB - simAB - simAB; + sqDists.putDouble(nB, sqdAB); + if(!(sqdAB > 0.)) { + continue; + } + sumid += 1. / Math.sqrt(sqdAB); + sumisqd += 1. 
/ sqdAB; + // Update heap + if(nn.size() < k) { + nn.add(DBIDUtil.newPair(sqdAB, nB)); + } + else if(sqdAB < nn.peek().doubleValue()) { + nn.replaceTopElement(DBIDUtil.newPair(sqdAB, nB)); + } + } + + // Compute FastABOD approximation, adjust for lower bound. + // LB-ABOF is defined via a numerically unstable formula. + // Variance as E(X^2)-E(X)^2 suffers from catastrophic cancellation! + // TODO: ensure numerical precision! + double nnsum = 0., nnsumsq = 0., nnsumisqd = 0.; + for(ObjectHeap.UnsortedIter<DoubleDBIDPair> iB = nn.unsortedIter(); iB.valid(); iB.advance()) { + DoubleDBIDPair nB = iB.get(); + double sqdAB = nB.doubleValue(); + double simAB = kernelMatrix.getSimilarity(pA, nB); + if(!(sqdAB > 0.)) { + continue; + } + for(ObjectHeap.UnsortedIter<DoubleDBIDPair> iC = nn.unsortedIter(); iC.valid(); iC.advance()) { + DoubleDBIDPair nC = iC.get(); + if(DBIDUtil.compare(nC, nB) < 0) { + continue; + } + double sqdAC = nC.doubleValue(); + double simAC = kernelMatrix.getSimilarity(pA, nC); + if(!(sqdAC > 0.)) { + continue; + } + // Exploit bilinearity of scalar product: + // <B-A, C-A> = <B, C-A> - <A,C-A> + // = <B,C> - <B,A> - <A,C> + <A,A> + double simBC = kernelMatrix.getSimilarity(nB, nC); + double numerator = simBC - simAB - simAC + simAA; + double sqweight = 1. / (sqdAB * sqdAC); + double weight = Math.sqrt(sqweight); + double val = numerator * sqweight; + nnsum += val * weight; + nnsumsq += val * val * weight; + nnsumisqd += sqweight; + } + } + // Remaining weight, term R2: + double r2 = sumisqd * sumisqd - 2. * nnsumisqd; + double tmp = (2. * nnsum + r2) / (sumid * sumid); + double lbabof = 2. * nnsumsq / (sumid * sumid) - tmp * tmp; + + // Track maximum? + if(lbabof > max) { + max = lbabof; + } + abodvalues.putDouble(pA, lbabof); + candidates.add(DBIDUtil.newPair(lbabof, pA)); + } + minmaxabod.put(max); // Put maximum from approximate values. 
+ + // refine Candidates + int refinements = 0; + DoubleMinHeap topscores = new DoubleMinHeap(l); + MeanVariance s = new MeanVariance(); + while(!candidates.isEmpty()) { + // Stop refining + if(topscores.size() >= k && candidates.peek().doubleValue() > topscores.peek()) { + break; + } + DoubleDBIDPair pA = candidates.poll(); + final double abof = computeABOF(relation, kernelMatrix, pA, s); + // Store refined score: + abodvalues.putDouble(pA, abof); + minmaxabod.put(abof); + // Update the heap tracking the top scores. + if(topscores.size() < k) { + topscores.add(abof); + } + else { + if(topscores.peek() > abof) { + topscores.replaceTopElement(abof); + } + } + refinements += 1; + } + if(LOG.isStatistics()) { + LoggingConfiguration.setVerbose(Level.VERYVERBOSE); + LOG.statistics(new LongStatistic("lb-abod.refinements", refinements)); + } + // Build result representation. + Relation<Double> scoreResult = new MaterializedRelation<>("Angle-based Outlier Detection", "abod-outlier", TypeUtil.DOUBLE, abodvalues, ids); + OutlierScoreMeta scoreMeta = new InvertedOutlierScoreMeta(minmaxabod.getMin(), minmaxabod.getMax(), 0.0, Double.POSITIVE_INFINITY); + return new OutlierResult(scoreMeta, scoreResult); + } + + @Override + public TypeInformation[] getInputTypeRestriction() { + return TypeUtil.array(TypeUtil.NUMBER_VECTOR_FIELD); + } + + @Override + protected Logging getLogger() { + return LOG; + } + + /** + * Parameterization class. + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer<V extends NumberVector<?>> extends FastABOD.Parameterizer<V> { + /** + * Parameter to specify the number of outliers to compute exactly. + */ + public static final OptionID L_ID = new OptionID("abod.l", "Number of top outliers to compute."); + + /** + * Number of outliers to find. 
+ */ + protected int l = 0; + + @Override + protected void makeOptions(Parameterization config) { + super.makeOptions(config); + final IntParameter lP = new IntParameter(L_ID); + lP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT); + if(config.grab(lP)) { + l = lP.getValue(); + } + } + + @Override + protected LBABOD<V> makeInstance() { + return new LBABOD<>(kernelFunction, k, l); + } + } +} diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/ODIN.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/ODIN.java index f22cdeb7..a5b39146 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/ODIN.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/ODIN.java @@ -45,7 +45,7 @@ import de.lmu.ifi.dbs.elki.result.outlier.OutlierResult; import de.lmu.ifi.dbs.elki.result.outlier.OutlierScoreMeta; import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; @@ -110,19 +110,19 @@ public class ODIN<O, D extends Distance<D>> extends AbstractDistanceBasedAlgorit double inc = 1. / (k - 1); double min = Double.POSITIVE_INFINITY, max = 0.0; // Process all objects - for (DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) { + for(DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) { // Find the nearest neighbors (using an index, if available!) 
KNNList<D> neighbors = knnq.getKNNForDBID(iter, k); // For each neighbor, except ourselves, increase the in-degree: - for (DBIDIter nei = neighbors.iter(); nei.valid(); nei.advance()) { - if (DBIDUtil.equal(iter, nei)) { + for(DBIDIter nei = neighbors.iter(); nei.valid(); nei.advance()) { + if(DBIDUtil.equal(iter, nei)) { continue; } final double value = scores.doubleValue(nei) + inc; - if (value < min) { + if(value < min) { min = value; } - if (value > max) { + if(value > max) { max = value; } scores.put(nei, value); @@ -178,8 +178,8 @@ public class ODIN<O, D extends Distance<D>> extends AbstractDistanceBasedAlgorit // Since in a database context, the 1 nearest neighbor // will usually be the query object itself, we require // this value to be at least 2. - param.addConstraint(new GreaterConstraint(1)); - if (config.grab(param)) { + param.addConstraint(CommonConstraints.GREATER_THAN_ONE_INT); + if(config.grab(param)) { k = param.intValue(); } } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/OPTICSOF.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/OPTICSOF.java index f6d46f57..b1ffae63 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/OPTICSOF.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/OPTICSOF.java @@ -1,26 +1,27 @@ package de.lmu.ifi.dbs.elki.algorithm.outlier;
-/* -This file is part of ELKI: -Environment for Developing KDD-Applications Supported by Index-Structures - -Copyright (C) 2013 -Ludwig-Maximilians-Universität München -Lehr- und Forschungseinheit für Datenbanksysteme -ELKI Development Team - -This program is free software: you can redistribute it and/or modify -it under the terms of the GNU Affero General Public License as published by -the Free Software Foundation, either version 3 of the License, or -(at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU Affero General Public License for more details. - -You should have received a copy of the GNU Affero General Public License -along with this program. If not, see <http://www.gnu.org/licenses/>. -*/ +
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures +
+ Copyright (C) 2013
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team +
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version. +
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details. +
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
import java.util.ArrayList;
import java.util.List;
@@ -54,7 +55,7 @@ import de.lmu.ifi.dbs.elki.result.outlier.QuotientOutlierScoreMeta; import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
@@ -122,7 +123,7 @@ public class OPTICSOF<O, D extends NumberDistance<D, ?>> extends AbstractDistanc // Pass 1
// N_minpts(id) and core-distance(id)
- for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
KNNList<D> minptsNeighbours = knnQuery.getKNNForDBID(iditer, minpts);
D d = minptsNeighbours.getKNNDistance();
nMinPts.put(iditer, minptsNeighbours);
@@ -133,11 +134,11 @@ public class OPTICSOF<O, D extends NumberDistance<D, ?>> extends AbstractDistanc // Pass 2
WritableDataStore<List<Double>> reachDistance = DataStoreUtil.makeStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP, List.class);
WritableDoubleDataStore lrds = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP);
- for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
List<Double> core = new ArrayList<>();
double lrd = 0;
// TODO: optimize for double distances
- for (DistanceDBIDListIter<D> neighbor = nMinPts.get(iditer).iter(); neighbor.valid(); neighbor.advance()) {
+ for(DistanceDBIDListIter<D> neighbor = nMinPts.get(iditer).iter(); neighbor.valid(); neighbor.advance()) {
double coreDist = coreDistance.doubleValue(neighbor);
double dist = distQuery.distance(iditer, neighbor).doubleValue();
double rd = Math.max(coreDist, dist);
@@ -152,9 +153,9 @@ public class OPTICSOF<O, D extends NumberDistance<D, ?>> extends AbstractDistanc // Pass 3
DoubleMinMax ofminmax = new DoubleMinMax();
WritableDoubleDataStore ofs = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_STATIC);
- for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
double of = 0;
- for (DBIDIter neighbor = nMinPts.get(iditer).iter(); neighbor.valid(); neighbor.advance()) {
+ for(DBIDIter neighbor = nMinPts.get(iditer).iter(); neighbor.valid(); neighbor.advance()) {
double lrd = lrds.doubleValue(iditer);
double lrdN = lrds.doubleValue(neighbor);
of = of + lrdN / lrd;
@@ -169,7 +170,7 @@ public class OPTICSOF<O, D extends NumberDistance<D, ?>> extends AbstractDistanc OutlierScoreMeta scoreMeta = new QuotientOutlierScoreMeta(ofminmax.getMin(), ofminmax.getMax(), 0.0, Double.POSITIVE_INFINITY, 1.0);
return new OutlierResult(scoreMeta, scoreResult);
}
-
+
@Override
public TypeInformation[] getInputTypeRestriction() {
return TypeUtil.array(getDistanceFunction().getInputTypeRestriction());
@@ -181,11 +182,11 @@ public class OPTICSOF<O, D extends NumberDistance<D, ?>> extends AbstractDistanc }
/**
- * Parameterization class. - * - * @author Erich Schubert - * - * @apiviz.exclude + * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
*/
public static class Parameterizer<O, D extends NumberDistance<D, ?>> extends AbstractDistanceBasedAlgorithm.Parameterizer<O, D> {
protected int minpts = 0;
@@ -194,7 +195,7 @@ public class OPTICSOF<O, D extends NumberDistance<D, ?>> extends AbstractDistanc protected void makeOptions(Parameterization config) {
super.makeOptions(config);
final IntParameter param = new IntParameter(OPTICS.MINPTS_ID);
- param.addConstraint(new GreaterConstraint(1));
+ param.addConstraint(CommonConstraints.GREATER_THAN_ONE_INT);
if(config.grab(param)) {
minpts = param.getValue();
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/ReferenceBasedOutlierDetection.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/ReferenceBasedOutlierDetection.java index 092bbc45..d254c9a1 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/ReferenceBasedOutlierDetection.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/ReferenceBasedOutlierDetection.java @@ -56,7 +56,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
@@ -182,14 +182,14 @@ public class ReferenceBasedOutlierDetection<V extends NumberVector<?>, D extends }
// compute maximum density
double maxDensity = 0.0;
- for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
double dens = rbod_score.doubleValue(iditer);
if(dens > maxDensity) {
maxDensity = dens;
}
}
// compute ROS
- for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
double score = 1 - (rbod_score.doubleValue(iditer) / maxDensity);
rbod_score.putDouble(iditer, score);
}
@@ -218,7 +218,7 @@ public class ReferenceBasedOutlierDetection<V extends NumberVector<?>, D extends protected DistanceDBIDList<D> computeDistanceVector(V refPoint, Relation<V> database, DistanceQuery<V, D> distFunc) {
// TODO: optimize for double distances?
GenericDistanceDBIDList<D> referenceDists = new GenericDistanceDBIDList<>(database.size());
- for(DBIDIter iditer = database.iterDBIDs(); iditer.valid(); iditer.advance()) { + for(DBIDIter iditer = database.iterDBIDs(); iditer.valid(); iditer.advance()) {
referenceDists.add(distFunc.distance(iditer, refPoint), iditer);
}
referenceDists.sort();
@@ -319,7 +319,7 @@ public class ReferenceBasedOutlierDetection<V extends NumberVector<?>, D extends protected void makeOptions(Parameterization config) {
super.makeOptions(config);
final IntParameter pK = new IntParameter(K_ID);
- pK.addConstraint(new GreaterConstraint(1));
+ pK.addConstraint(CommonConstraints.GREATER_THAN_ONE_INT);
if(config.grab(pK)) {
k = pK.getValue();
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/SimpleCOP.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/SimpleCOP.java index 38820ab7..72a727a5 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/SimpleCOP.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/SimpleCOP.java @@ -60,7 +60,7 @@ import de.lmu.ifi.dbs.elki.utilities.FormatUtil; import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import de.lmu.ifi.dbs.elki.utilities.documentation.Title; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; @@ -123,7 +123,7 @@ public class SimpleCOP<V extends NumberVector<?>, D extends NumberDistance<D, ?> {// compute neighbors of each db object FiniteProgress progressLocalPCA = LOG.isVerbose() ? 
new FiniteProgress("Correlation Outlier Probabilities", data.size(), LOG) : null; double sqrt2 = Math.sqrt(2.0); - for (DBIDIter id = data.iterDBIDs(); id.valid(); id.advance()) { + for(DBIDIter id = data.iterDBIDs(); id.valid(); id.advance()) { KNNList<D> neighbors = knnQuery.getKNNForDBID(id, k + 1); ModifiableDBIDs nids = DBIDUtil.newArray(neighbors); nids.remove(id); @@ -147,11 +147,11 @@ public class SimpleCOP<V extends NumberVector<?>, D extends NumberDistance<D, ?> cop_sol.put(id, depsol); - if (progressLocalPCA != null) { + if(progressLocalPCA != null) { progressLocalPCA.incrementProcessed(LOG); } } - if (progressLocalPCA != null) { + if(progressLocalPCA != null) { progressLocalPCA.ensureCompleted(LOG); } } @@ -218,12 +218,12 @@ public class SimpleCOP<V extends NumberVector<?>, D extends NumberDistance<D, ?> protected void makeOptions(Parameterization config) { super.makeOptions(config); IntParameter kP = new IntParameter(K_ID); - kP.addConstraint(new GreaterConstraint(0)); - if (config.grab(kP)) { + kP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT); + if(config.grab(kP)) { k = kP.intValue(); } ObjectParameter<PCAFilteredRunner<V>> pcaP = new ObjectParameter<>(PCARUNNER_ID, PCAFilteredRunner.class, PCAFilteredRunner.class); - if (config.grab(pcaP)) { + if(config.grab(pcaP)) { pca = pcaP.instantiateClass(config); } } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/ALOCI.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/ALOCI.java index d48679a9..f978365e 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/ALOCI.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/ALOCI.java @@ -141,7 +141,7 @@ public class ALOCI<O extends NumberVector<?>, D extends NumberDistance<D, ?>> ex public OutlierResult run(Database database, Relation<O> relation) {
final int dim = RelationUtil.dimensionality(relation);
- final Random random = rnd.getRandom();
+ final Random random = rnd.getSingleThreadedRandom();
FiniteProgress progressPreproc = LOG.isVerbose() ? new FiniteProgress("Build aLOCI quadtress", g, LOG) : null;
// Compute extend of dataset.
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/FlexibleLOF.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/FlexibleLOF.java index 80f60e8b..2508b6b0 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/FlexibleLOF.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/FlexibleLOF.java @@ -64,7 +64,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import de.lmu.ifi.dbs.elki.utilities.documentation.Title; import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; @@ -581,14 +581,14 @@ public class FlexibleLOF<O, D extends NumberDistance<D, ?>> extends AbstractAlgo super.makeOptions(config); final IntParameter pK = new IntParameter(KREF_ID); - pK.addConstraint(new GreaterConstraint(1)); + pK.addConstraint(CommonConstraints.GREATER_THAN_ONE_INT); if (config.grab(pK)) { krefer = pK.intValue(); } final IntParameter pK2 = new IntParameter(KREACH_ID); pK2.setOptional(true); - pK2.addConstraint(new GreaterConstraint(1)); + pK2.addConstraint(CommonConstraints.GREATER_THAN_ONE_INT); if (config.grab(pK2)) { kreach = pK2.intValue(); } else { diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/INFLO.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/INFLO.java index ae297a3c..28fcf01b 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/INFLO.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/INFLO.java @@ -53,7 +53,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Description; import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
@@ -142,7 +142,7 @@ public class INFLO<O, D extends NumberDistance<D, ?>> extends AbstractDistanceBa // density
WritableDoubleDataStore density = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT);
// init knns and rnns
- for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
+ for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
knns.put(iditer, DBIDUtil.newArray());
rnns.put(iditer, DBIDUtil.newArray());
}
@@ -150,10 +150,10 @@ public class INFLO<O, D extends NumberDistance<D, ?>> extends AbstractDistanceBa // TODO: use kNN preprocessor?
KNNQuery<O, D> knnQuery = database.getKNNQuery(distFunc, k, DatabaseQuery.HINT_HEAVY_USE);
- for (DBIDIter id = relation.iterDBIDs(); id.valid(); id.advance()) {
+ for(DBIDIter id = relation.iterDBIDs(); id.valid(); id.advance()) {
// if not visited count=0
int count = rnns.get(id).size();
- if (!processedIDs.contains(id)) {
+ if(!processedIDs.contains(id)) {
// TODO: use exactly k neighbors?
KNNList<D> list = knnQuery.getKNNForDBID(id, k);
knns.get(id).addDBIDs(list);
@@ -162,8 +162,8 @@ public class INFLO<O, D extends NumberDistance<D, ?>> extends AbstractDistanceBa }
ModifiableDBIDs s = knns.get(id);
- for (DBIDIter q = knns.get(id).iter(); q.valid(); q.advance()) {
- if (!processedIDs.contains(q)) {
+ for(DBIDIter q = knns.get(id).iter(); q.valid(); q.advance()) {
+ if(!processedIDs.contains(q)) {
// TODO: use exactly k neighbors?
KNNList<D> listQ = knnQuery.getKNNForDBID(q, k);
knns.get(q).addDBIDs(listQ);
@@ -171,13 +171,13 @@ public class INFLO<O, D extends NumberDistance<D, ?>> extends AbstractDistanceBa processedIDs.add(q);
}
- if (knns.get(q).contains(id)) {
+ if(knns.get(q).contains(id)) {
rnns.get(q).add(id);
rnns.get(id).add(q);
count++;
}
}
- if (count >= s.size() * m) {
+ if(count >= s.size() * m) {
pruned.add(id);
}
}
@@ -186,15 +186,15 @@ public class INFLO<O, D extends NumberDistance<D, ?>> extends AbstractDistanceBa // IF Object is pruned INFLO=1.0
DoubleMinMax inflominmax = new DoubleMinMax();
WritableDoubleDataStore inflos = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC);
- for (DBIDIter id = relation.iterDBIDs(); id.valid(); id.advance()) {
- if (!pruned.contains(id)) {
+ for(DBIDIter id = relation.iterDBIDs(); id.valid(); id.advance()) {
+ if(!pruned.contains(id)) {
ModifiableDBIDs knn = knns.get(id);
ModifiableDBIDs rnn = rnns.get(id);
double denP = density.doubleValue(id);
knn.addDBIDs(rnn);
Mean mean = new Mean();
- for (DBIDIter iter = knn.iter(); iter.valid(); iter.advance()) {
+ for(DBIDIter iter = knn.iter(); iter.valid(); iter.advance()) {
mean.put(density.doubleValue(iter));
}
double den = mean.getMean() / denP;
@@ -203,7 +203,7 @@ public class INFLO<O, D extends NumberDistance<D, ?>> extends AbstractDistanceBa inflominmax.put(den);
}
- if (pruned.contains(id)) {
+ if(pruned.contains(id)) {
inflos.putDouble(id, 1.0);
inflominmax.put(1.0);
}
@@ -241,14 +241,14 @@ public class INFLO<O, D extends NumberDistance<D, ?>> extends AbstractDistanceBa protected void makeOptions(Parameterization config) {
super.makeOptions(config);
final DoubleParameter mP = new DoubleParameter(M_ID, 1.0);
- mP.addConstraint(new GreaterConstraint(0.0));
- if (config.grab(mP)) {
+ mP.addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE);
+ if(config.grab(mP)) {
m = mP.doubleValue();
}
final IntParameter kP = new IntParameter(K_ID);
- kP.addConstraint(new GreaterConstraint(1));
- if (config.grab(kP)) {
+ kP.addConstraint(CommonConstraints.GREATER_THAN_ONE_INT);
+ if(config.grab(kP)) {
k = kP.intValue();
}
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/LDF.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/LDF.java index 4a86e93d..e5049877 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/LDF.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/LDF.java @@ -55,6 +55,7 @@ import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress; import de.lmu.ifi.dbs.elki.logging.progress.StepProgress; import de.lmu.ifi.dbs.elki.math.DoubleMinMax; +import de.lmu.ifi.dbs.elki.math.MathUtil; import de.lmu.ifi.dbs.elki.math.statistics.kernelfunctions.GaussianKernelDensityFunction; import de.lmu.ifi.dbs.elki.math.statistics.kernelfunctions.KernelDensityFunction; import de.lmu.ifi.dbs.elki.result.outlier.BasicOutlierScoreMeta; @@ -62,7 +63,7 @@ import de.lmu.ifi.dbs.elki.result.outlier.OutlierResult; import de.lmu.ifi.dbs.elki.result.outlier.OutlierScoreMeta; import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; @@ -149,8 +150,8 @@ public class LDF<O extends NumberVector<?>, D extends NumberDistance<D, ?>> exte // "HEAVY" flag for KNN Query since it is used more than once KNNQuery<O, D> knnq = QueryUtil.getKNNQuery(relation, getDistanceFunction(), k, DatabaseQuery.HINT_HEAVY_USE, DatabaseQuery.HINT_OPTIMIZED_ONLY, DatabaseQuery.HINT_NO_CACHE); // No optimized kNN query - use a preprocessor! 
- if (!(knnq instanceof PreprocessorKNNQuery)) { - if (stepprog != null) { + if(!(knnq instanceof PreprocessorKNNQuery)) { + if(stepprog != null) { stepprog.beginStep(1, "Materializing neighborhoods w.r.t. distance function.", LOG); } MaterializeKNNPreprocessor<O, D> preproc = new MaterializeKNNPreprocessor<>(relation, getDistanceFunction(), k); @@ -160,43 +161,46 @@ public class LDF<O extends NumberVector<?>, D extends NumberDistance<D, ?>> exte } // Compute LDEs - if (stepprog != null) { + if(stepprog != null) { stepprog.beginStep(2, "Computing LDEs.", LOG); } WritableDoubleDataStore ldes = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP); FiniteProgress densProgress = LOG.isVerbose() ? new FiniteProgress("Densities", ids.size(), LOG) : null; - for (DBIDIter it = ids.iter(); it.valid(); it.advance()) { + for(DBIDIter it = ids.iter(); it.valid(); it.advance()) { final KNNList<D> neighbors = knnq.getKNNForDBID(it, k); double sum = 0.0; int count = 0; - if (neighbors instanceof DoubleDistanceKNNList) { + if(neighbors instanceof DoubleDistanceKNNList) { // Fast version for double distances - for (DoubleDistanceDBIDListIter neighbor = ((DoubleDistanceKNNList) neighbors).iter(); neighbor.valid(); neighbor.advance()) { - if (DBIDUtil.equal(neighbor, it)) { + for(DoubleDistanceDBIDListIter neighbor = ((DoubleDistanceKNNList) neighbors).iter(); neighbor.valid(); neighbor.advance()) { + if(DBIDUtil.equal(neighbor, it)) { continue; } final double nkdist = ((DoubleDistanceKNNList) knnq.getKNNForDBID(neighbor, k)).doubleKNNDistance(); - if (nkdist > 0.) { + if(nkdist > 0.) 
{ final double v = Math.max(nkdist, neighbor.doubleDistance()) / (h * nkdist); - sum += kernel.density(v) / Math.pow(h * nkdist, dim); + sum += kernel.density(v) / MathUtil.powi(h * nkdist, dim); count++; - } else { + } + else { sum = Double.POSITIVE_INFINITY; count++; break; } } - } else { - for (DistanceDBIDListIter<D> neighbor = neighbors.iter(); neighbor.valid(); neighbor.advance()) { - if (DBIDUtil.equal(neighbor, it)) { + } + else { + for(DistanceDBIDListIter<D> neighbor = neighbors.iter(); neighbor.valid(); neighbor.advance()) { + if(DBIDUtil.equal(neighbor, it)) { continue; } final double nkdist = knnq.getKNNForDBID(neighbor, k).getKNNDistance().doubleValue(); - if (nkdist > 0.) { + if(nkdist > 0.) { final double v = Math.max(nkdist, neighbor.getDistance().doubleValue()) / (h * nkdist); - sum += kernel.density(v) / Math.pow(h * nkdist, dim); + sum += kernel.density(v) / MathUtil.powi(h * nkdist, dim); count++; - } else { + } + else { sum = Double.POSITIVE_INFINITY; count++; break; @@ -204,16 +208,16 @@ public class LDF<O extends NumberVector<?>, D extends NumberDistance<D, ?>> exte } } ldes.putDouble(it, sum / count); - if (densProgress != null) { + if(densProgress != null) { densProgress.incrementProcessed(LOG); } } - if (densProgress != null) { + if(densProgress != null) { densProgress.ensureCompleted(LOG); } // Compute local density factors. - if (stepprog != null) { + if(stepprog != null) { stepprog.beginStep(3, "Computing LDFs.", LOG); } WritableDoubleDataStore ldfs = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_STATIC); @@ -221,14 +225,14 @@ public class LDF<O extends NumberVector<?>, D extends NumberDistance<D, ?>> exte DoubleMinMax lofminmax = new DoubleMinMax(); FiniteProgress progressLOFs = LOG.isVerbose() ? 
new FiniteProgress("Local Density Factors", ids.size(), LOG) : null; - for (DBIDIter it = ids.iter(); it.valid(); it.advance()) { + for(DBIDIter it = ids.iter(); it.valid(); it.advance()) { final double lrdp = ldes.doubleValue(it); final KNNList<D> neighbors = knnq.getKNNForDBID(it, k); double sum = 0.0; int count = 0; - for (DBIDIter neighbor = neighbors.iter(); neighbor.valid(); neighbor.advance()) { + for(DBIDIter neighbor = neighbors.iter(); neighbor.valid(); neighbor.advance()) { // skip the point itself - if (DBIDUtil.equal(neighbor, it)) { + if(DBIDUtil.equal(neighbor, it)) { continue; } sum += ldes.doubleValue(neighbor); @@ -241,15 +245,15 @@ public class LDF<O extends NumberVector<?>, D extends NumberDistance<D, ?>> exte // update minimum and maximum lofminmax.put(ldf); - if (progressLOFs != null) { + if(progressLOFs != null) { progressLOFs.incrementProcessed(LOG); } } - if (progressLOFs != null) { + if(progressLOFs != null) { progressLOFs.ensureCompleted(LOG); } - if (stepprog != null) { + if(stepprog != null) { stepprog.setCompleted(LOG); } @@ -327,23 +331,23 @@ public class LDF<O extends NumberVector<?>, D extends NumberDistance<D, ?>> exte super.makeOptions(config); final IntParameter pK = new IntParameter(K_ID); - pK.addConstraint(new GreaterConstraint(1)); - if (config.grab(pK)) { + pK.addConstraint(CommonConstraints.GREATER_THAN_ONE_INT); + if(config.grab(pK)) { k = pK.getValue(); } ObjectParameter<KernelDensityFunction> kernelP = new ObjectParameter<>(KERNEL_ID, KernelDensityFunction.class, GaussianKernelDensityFunction.class); - if (config.grab(kernelP)) { + if(config.grab(kernelP)) { kernel = kernelP.instantiateClass(config); } DoubleParameter hP = new DoubleParameter(H_ID); - if (config.grab(hP)) { + if(config.grab(hP)) { h = hP.doubleValue(); } DoubleParameter cP = new DoubleParameter(C_ID, 0.1); - if (config.grab(cP)) { + if(config.grab(cP)) { c = cP.doubleValue(); } } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/LDOF.java 
b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/LDOF.java index 80ed3f68..36c70b48 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/LDOF.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/LDOF.java @@ -53,7 +53,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Description; import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
@@ -80,7 +80,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; @Title("LDOF: Local Distance-Based Outlier Factor")
@Description("Local outlier detection approach suitable for scattered data by averaging the kNN distance over all k nearest neighbors")
@Reference(authors = "K. Zhang, M. Hutter, H. Jin", title = "A New Local Distance-Based Outlier Detection Approach for Scattered Real-World Data", booktitle = "Proc. 13th Pacific-Asia Conference on Advances in Knowledge Discovery and Data Mining (PAKDD 2009), Bangkok, Thailand, 2009", url = "http://dx.doi.org/10.1007/978-3-642-01307-2_84")
-@Alias({"de.lmu.ifi.dbs.elki.algorithm.outlier.LDOF"})
+@Alias({ "de.lmu.ifi.dbs.elki.algorithm.outlier.LDOF" })
public class LDOF<O, D extends NumberDistance<D, ?>> extends AbstractDistanceBasedAlgorithm<O, D, OutlierResult> implements OutlierAlgorithm {
/**
* The logger for this class.
@@ -138,15 +138,16 @@ public class LDOF<O, D extends NumberDistance<D, ?>> extends AbstractDistanceBas FiniteProgress progressLDOFs = LOG.isVerbose() ? new FiniteProgress("LDOF_SCORE for objects", relation.size(), LOG) : null;
Mean dxp = new Mean(), Dxp = new Mean();
- for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
KNNList<D> neighbors = knnQuery.getKNNForDBID(iditer, k);
// skip the point itself
- dxp.reset(); Dxp.reset();
+ dxp.reset();
+ Dxp.reset();
// TODO: optimize for double distances
- for (DistanceDBIDListIter<D> neighbor1 = neighbors.iter(); neighbor1.valid(); neighbor1.advance()) {
+ for(DistanceDBIDListIter<D> neighbor1 = neighbors.iter(); neighbor1.valid(); neighbor1.advance()) {
if(!DBIDUtil.equal(neighbor1, iditer)) {
dxp.put(neighbor1.getDistance().doubleValue());
- for (DistanceDBIDListIter<D> neighbor2 = neighbors.iter(); neighbor2.valid(); neighbor2.advance()) {
+ for(DistanceDBIDListIter<D> neighbor2 = neighbors.iter(); neighbor2.valid(); neighbor2.advance()) {
if(!DBIDUtil.equal(neighbor1, neighbor2) && !DBIDUtil.equal(neighbor2, iditer)) {
Dxp.put(distFunc.distance(neighbor1, neighbor2).doubleValue());
}
@@ -199,7 +200,7 @@ public class LDOF<O, D extends NumberDistance<D, ?>> extends AbstractDistanceBas protected void makeOptions(Parameterization config) {
super.makeOptions(config);
final IntParameter kP = new IntParameter(K_ID);
- kP.addConstraint(new GreaterConstraint(1));
+ kP.addConstraint(CommonConstraints.GREATER_THAN_ONE_INT);
if(config.grab(kP)) {
k = kP.getValue();
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/LOF.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/LOF.java index 302dafe6..28166c75 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/LOF.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/LOF.java @@ -30,6 +30,7 @@ import de.lmu.ifi.dbs.elki.data.type.TypeUtil; import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; +import de.lmu.ifi.dbs.elki.database.datastore.DoubleDataStore; import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore; import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; @@ -59,7 +60,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Description; import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import de.lmu.ifi.dbs.elki.utilities.documentation.Title; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; @@ -129,8 +130,8 @@ public class LOF<O, D extends NumberDistance<D, ?>> extends AbstractDistanceBase // "HEAVY" flag for knn query since it is used more than once KNNQuery<O, D> knnq = database.getKNNQuery(dq, k, DatabaseQuery.HINT_HEAVY_USE, DatabaseQuery.HINT_OPTIMIZED_ONLY, DatabaseQuery.HINT_NO_CACHE); // No optimized kNN query - use a preprocessor! 
- if (!(knnq instanceof PreprocessorKNNQuery)) { - if (stepprog != null) { + if(!(knnq instanceof PreprocessorKNNQuery)) { + if(stepprog != null) { stepprog.beginStep(1, "Materializing LOF neighborhoods.", LOG); } MaterializeKNNPreprocessor<O, D> preproc = new MaterializeKNNPreprocessor<>(relation, getDistanceFunction(), k); @@ -139,109 +140,131 @@ public class LOF<O, D extends NumberDistance<D, ?>> extends AbstractDistanceBase DBIDs ids = relation.getDBIDs(); // Compute LRDs - if (stepprog != null) { + if(stepprog != null) { stepprog.beginStep(2, "Computing LRDs.", LOG); } WritableDoubleDataStore lrds = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP); - { - FiniteProgress lrdsProgress = LOG.isVerbose() ? new FiniteProgress("LRD", ids.size(), LOG) : null; - for (DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) { - final KNNList<D> neighbors = knnq.getKNNForDBID(iter, k); - double sum = 0.0; - int count = 0; - if (neighbors instanceof DoubleDistanceKNNList) { - // Fast version for double distances - for (DoubleDistanceDBIDListIter neighbor = ((DoubleDistanceKNNList) neighbors).iter(); neighbor.valid(); neighbor.advance()) { - if (DBIDUtil.equal(neighbor, iter)) { - continue; - } - KNNList<D> neighborsNeighbors = knnq.getKNNForDBID(neighbor, k); - final double nkdist; - if (neighborsNeighbors instanceof DoubleDistanceKNNList) { - nkdist = ((DoubleDistanceKNNList) neighborsNeighbors).doubleKNNDistance(); - } else { - nkdist = neighborsNeighbors.getKNNDistance().doubleValue(); - } - sum += Math.max(neighbor.doubleDistance(), nkdist); - count++; - } - } else { - for (DistanceDBIDListIter<D> neighbor = neighbors.iter(); neighbor.valid(); neighbor.advance()) { - if (DBIDUtil.equal(neighbor, iter)) { - continue; - } - KNNList<D> neighborsNeighbors = knnq.getKNNForDBID(neighbor, k); - sum += Math.max(neighbor.getDistance().doubleValue(), neighborsNeighbors.getKNNDistance().doubleValue()); - count++; - } - } - // Avoid 
division by 0 - final double lrd = (sum > 0) ? (count / sum) : Double.POSITIVE_INFINITY; - lrds.putDouble(iter, lrd); - if (lrdsProgress != null) { - lrdsProgress.incrementProcessed(LOG); - } - } - if (lrdsProgress != null) { - lrdsProgress.ensureCompleted(LOG); - } - } + computeLRDs(knnq, ids, lrds); // compute LOF_SCORE of each db object - if (stepprog != null) { + if(stepprog != null) { stepprog.beginStep(3, "Computing LOFs.", LOG); } - WritableDoubleDataStore lofs = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_STATIC); + DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_STATIC); + WritableDoubleDataStore lofs = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_DB); // track the maximum value for normalization. DoubleMinMax lofminmax = new DoubleMinMax(); - { - FiniteProgress progressLOFs = LOG.isVerbose() ? new FiniteProgress("LOF_SCORE for objects", ids.size(), LOG) : null; - for (DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) { - final double lof; - final double lrdp = lrds.doubleValue(iter); - final KNNList<D> neighbors = knnq.getKNNForDBID(iter, k); - if (!Double.isInfinite(lrdp)) { - double sum = 0.0; - int count = 0; - for (DBIDIter neighbor = neighbors.iter(); neighbor.valid(); neighbor.advance()) { - // skip the point itself - if (DBIDUtil.equal(neighbor, iter)) { - continue; - } - final double val = lrds.doubleValue(neighbor); - sum += val; - count++; - if (Double.isInfinite(val)) { - break; - } + computeLOFScores(knnq, ids, lrds, lofs, lofminmax); + + if(stepprog != null) { + stepprog.setCompleted(LOG); + } + + // Build result representation. 
+ Relation<Double> scoreResult = new MaterializedRelation<>("Local Outlier Factor", "lof-outlier", TypeUtil.DOUBLE, lofs, ids); + OutlierScoreMeta scoreMeta = new QuotientOutlierScoreMeta(lofminmax.getMin(), lofminmax.getMax(), 0.0, Double.POSITIVE_INFINITY, 1.0); + return new OutlierResult(scoreMeta, scoreResult); + } + + /** + * Compute local reachability distances. + * + * @param knnq KNN query + * @param ids IDs to process + * @param lrds Reachability storage + */ + private void computeLRDs(KNNQuery<O, D> knnq, DBIDs ids, WritableDoubleDataStore lrds) { + FiniteProgress lrdsProgress = LOG.isVerbose() ? new FiniteProgress("LRD", ids.size(), LOG) : null; + for(DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) { + final KNNList<D> neighbors = knnq.getKNNForDBID(iter, k); + double sum = 0.0; + int count = 0; + if(neighbors instanceof DoubleDistanceKNNList) { + // Fast version for double distances + for(DoubleDistanceDBIDListIter neighbor = ((DoubleDistanceKNNList) neighbors).iter(); neighbor.valid(); neighbor.advance()) { + if(DBIDUtil.equal(neighbor, iter)) { + continue; + } + KNNList<D> neighborsNeighbors = knnq.getKNNForDBID(neighbor, k); + final double nkdist; + if(neighborsNeighbors instanceof DoubleDistanceKNNList) { + nkdist = ((DoubleDistanceKNNList) neighborsNeighbors).doubleKNNDistance(); + } + else { + nkdist = neighborsNeighbors.getKNNDistance().doubleValue(); } - lof = sum / (lrdp * count); - } else { - lof = 1.0; + sum += Math.max(neighbor.doubleDistance(), nkdist); + count++; } - lofs.putDouble(iter, lof); - // update minimum and maximum - lofminmax.put(lof); - - if (progressLOFs != null) { - progressLOFs.incrementProcessed(LOG); + } + else { + for(DistanceDBIDListIter<D> neighbor = neighbors.iter(); neighbor.valid(); neighbor.advance()) { + if(DBIDUtil.equal(neighbor, iter)) { + continue; + } + KNNList<D> neighborsNeighbors = knnq.getKNNForDBID(neighbor, k); + sum += Math.max(neighbor.getDistance().doubleValue(), 
neighborsNeighbors.getKNNDistance().doubleValue()); + count++; } } - if (progressLOFs != null) { - progressLOFs.ensureCompleted(LOG); + // Avoid division by 0 + final double lrd = (sum > 0) ? (count / sum) : Double.POSITIVE_INFINITY; + lrds.putDouble(iter, lrd); + if(lrdsProgress != null) { + lrdsProgress.incrementProcessed(LOG); } } - - if (stepprog != null) { - stepprog.setCompleted(LOG); + if(lrdsProgress != null) { + lrdsProgress.ensureCompleted(LOG); } + } - // Build result representation. - Relation<Double> scoreResult = new MaterializedRelation<>("Local Outlier Factor", "lof-outlier", TypeUtil.DOUBLE, lofs, ids); - OutlierScoreMeta scoreMeta = new QuotientOutlierScoreMeta(lofminmax.getMin(), lofminmax.getMax(), 0.0, Double.POSITIVE_INFINITY, 1.0); - OutlierResult result = new OutlierResult(scoreMeta, scoreResult); + /** + * Compute local outlier factors. + * + * @param knnq KNN query + * @param ids IDs to process + * @param lrds Local reachability distances + * @param lofs Local outlier factor storage + * @param lofminmax Score minimum/maximum tracker + */ + private void computeLOFScores(KNNQuery<O, D> knnq, DBIDs ids, DoubleDataStore lrds, WritableDoubleDataStore lofs, DoubleMinMax lofminmax) { + FiniteProgress progressLOFs = LOG.isVerbose() ? 
new FiniteProgress("LOF_SCORE for objects", ids.size(), LOG) : null; + for(DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) { + final double lof; + final double lrdp = lrds.doubleValue(iter); + final KNNList<D> neighbors = knnq.getKNNForDBID(iter, k); + if(!Double.isInfinite(lrdp)) { + double sum = 0.0; + int count = 0; + for(DBIDIter neighbor = neighbors.iter(); neighbor.valid(); neighbor.advance()) { + // skip the point itself + if(DBIDUtil.equal(neighbor, iter)) { + continue; + } + final double val = lrds.doubleValue(neighbor); + sum += val; + count++; + if(Double.isInfinite(val)) { + break; + } + } + lof = sum / (lrdp * count); + } + else { + lof = 1.0; + } + lofs.putDouble(iter, lof); + // update minimum and maximum + lofminmax.put(lof); - return result; + if(progressLOFs != null) { + progressLOFs.incrementProcessed(LOG); + } + } + if(progressLOFs != null) { + progressLOFs.ensureCompleted(LOG); + } } @Override @@ -279,8 +302,8 @@ public class LOF<O, D extends NumberDistance<D, ?>> extends AbstractDistanceBase super.makeOptions(config); final IntParameter pK = new IntParameter(K_ID); - pK.addConstraint(new GreaterConstraint(1)); - if (config.grab(pK)) { + pK.addConstraint(CommonConstraints.GREATER_THAN_ONE_INT); + if(config.grab(pK)) { k = pK.getValue(); } } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/LoOP.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/LoOP.java index 15ff690a..525d45f2 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/LoOP.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/LoOP.java @@ -64,7 +64,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Title; import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException; import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint; +import 
de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; @@ -183,26 +183,28 @@ public class LoOP<O, D extends NumberDistance<D, ?>> extends AbstractAlgorithm<O protected Pair<KNNQuery<O, D>, KNNQuery<O, D>> getKNNQueries(Database database, Relation<O> relation, StepProgress stepprog) { KNNQuery<O, D> knnComp; KNNQuery<O, D> knnReach; - if (comparisonDistanceFunction == reachabilityDistanceFunction || comparisonDistanceFunction.equals(reachabilityDistanceFunction)) { + if(comparisonDistanceFunction == reachabilityDistanceFunction || comparisonDistanceFunction.equals(reachabilityDistanceFunction)) { // We need each neighborhood twice - use "HEAVY" flag. knnComp = QueryUtil.getKNNQuery(relation, comparisonDistanceFunction, Math.max(kreach, kcomp), DatabaseQuery.HINT_HEAVY_USE, DatabaseQuery.HINT_OPTIMIZED_ONLY, DatabaseQuery.HINT_NO_CACHE); // No optimized kNN query - use a preprocessor! 
- if (knnComp == null) { - if (stepprog != null) { + if(knnComp == null) { + if(stepprog != null) { stepprog.beginStep(1, "Materializing neighborhoods with respect to reference neighborhood distance function.", LOG); } MaterializeKNNPreprocessor<O, D> preproc = new MaterializeKNNPreprocessor<>(relation, comparisonDistanceFunction, kcomp); database.addIndex(preproc); DistanceQuery<O, D> cdq = database.getDistanceQuery(relation, comparisonDistanceFunction); knnComp = preproc.getKNNQuery(cdq, kreach, DatabaseQuery.HINT_HEAVY_USE); - } else { - if (stepprog != null) { + } + else { + if(stepprog != null) { stepprog.beginStep(1, "Optimized neighborhoods provided by database.", LOG); } } knnReach = knnComp; - } else { - if (stepprog != null) { + } + else { + if(stepprog != null) { stepprog.beginStep(1, "Not materializing distance functions, since we request each DBID once only.", LOG); } knnComp = QueryUtil.getKNNQuery(relation, comparisonDistanceFunction, kreach); @@ -228,10 +230,10 @@ public class LoOP<O, D extends NumberDistance<D, ?>> extends AbstractAlgorithm<O KNNQuery<O, D> knnReach = pair.getSecond(); // Assert we got something - if (knnComp == null) { + if(knnComp == null) { throw new AbortException("No kNN queries supported by database for comparison distance function."); } - if (knnReach == null) { + if(knnReach == null) { throw new AbortException("No kNN queries supported by database for density estimation distance function."); } @@ -239,34 +241,35 @@ public class LoOP<O, D extends NumberDistance<D, ?>> extends AbstractAlgorithm<O WritableDoubleDataStore pdists = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP); Mean mean = new Mean(); {// computing PRDs - if (stepprog != null) { + if(stepprog != null) { stepprog.beginStep(3, "Computing pdists", LOG); } FiniteProgress prdsProgress = LOG.isVerbose() ? 
new FiniteProgress("pdists", relation.size(), LOG) : null; - for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { final KNNList<D> neighbors = knnReach.getKNNForDBID(iditer, kreach); mean.reset(); // use first kref neighbors as reference set int ks = 0; // TODO: optimize for double distances - if (neighbors instanceof DoubleDistanceKNNList) { - for (DoubleDistanceDBIDListIter neighbor = ((DoubleDistanceKNNList) neighbors).iter(); neighbor.valid(); neighbor.advance()) { - if (objectIsInKNN || !DBIDUtil.equal(neighbor, iditer)) { + if(neighbors instanceof DoubleDistanceKNNList) { + for(DoubleDistanceDBIDListIter neighbor = ((DoubleDistanceKNNList) neighbors).iter(); neighbor.valid(); neighbor.advance()) { + if(objectIsInKNN || !DBIDUtil.equal(neighbor, iditer)) { final double d = neighbor.doubleDistance(); mean.put(d * d); ks++; - if (ks >= kreach) { + if(ks >= kreach) { break; } } } - } else { - for (DistanceDBIDListIter<D> neighbor = neighbors.iter(); neighbor.valid(); neighbor.advance()) { - if (objectIsInKNN || !DBIDUtil.equal(neighbor, iditer)) { + } + else { + for(DistanceDBIDListIter<D> neighbor = neighbors.iter(); neighbor.valid(); neighbor.advance()) { + if(objectIsInKNN || !DBIDUtil.equal(neighbor, iditer)) { double d = neighbor.getDistance().doubleValue(); mean.put(d * d); ks++; - if (ks >= kreach) { + if(ks >= kreach) { break; } } @@ -274,7 +277,7 @@ public class LoOP<O, D extends NumberDistance<D, ?>> extends AbstractAlgorithm<O } double pdist = lambda * Math.sqrt(mean.getMean()); pdists.putDouble(iditer, pdist); - if (prdsProgress != null) { + if(prdsProgress != null) { prdsProgress.incrementProcessed(LOG); } } @@ -283,62 +286,62 @@ public class LoOP<O, D extends NumberDistance<D, ?>> extends AbstractAlgorithm<O WritableDoubleDataStore plofs = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_HOT | 
DataStoreFactory.HINT_TEMP); MeanVariance mvplof = new MeanVariance(); {// compute LOOP_SCORE of each db object - if (stepprog != null) { + if(stepprog != null) { stepprog.beginStep(4, "Computing PLOF", LOG); } FiniteProgress progressPLOFs = LOG.isVerbose() ? new FiniteProgress("PLOFs for objects", relation.size(), LOG) : null; MeanVariance mv = new MeanVariance(); - for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { final KNNList<D> neighbors = knnComp.getKNNForDBID(iditer, kcomp); mv.reset(); // use first kref neighbors as comparison set. int ks = 0; - for (DBIDIter neighbor = neighbors.iter(); neighbor.valid(); neighbor.advance()) { - if (objectIsInKNN || !DBIDUtil.equal(neighbor, iditer)) { + for(DBIDIter neighbor = neighbors.iter(); neighbor.valid(); neighbor.advance()) { + if(objectIsInKNN || !DBIDUtil.equal(neighbor, iditer)) { mv.put(pdists.doubleValue(neighbor)); ks++; - if (ks >= kcomp) { + if(ks >= kcomp) { break; } } } double plof = Math.max(pdists.doubleValue(iditer) / mv.getMean(), 1.0); - if (Double.isNaN(plof) || Double.isInfinite(plof)) { + if(Double.isNaN(plof) || Double.isInfinite(plof)) { plof = 1.0; } plofs.putDouble(iditer, plof); mvplof.put((plof - 1.0) * (plof - 1.0)); - if (progressPLOFs != null) { + if(progressPLOFs != null) { progressPLOFs.incrementProcessed(LOG); } } } double nplof = lambda * Math.sqrt(mvplof.getMean()); - if (LOG.isDebugging()) { + if(LOG.isDebugging()) { LOG.verbose("nplof normalization factor is " + nplof + " " + mvplof.getMean() + " " + mvplof.getSampleStddev()); } // Compute final LoOP values. WritableDoubleDataStore loops = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC); {// compute LOOP_SCORE of each db object - if (stepprog != null) { + if(stepprog != null) { stepprog.beginStep(5, "Computing LoOP scores", LOG); } FiniteProgress progressLOOPs = LOG.isVerbose() ? 
new FiniteProgress("LoOP for objects", relation.size(), LOG) : null; - for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { loops.putDouble(iditer, NormalDistribution.erf((plofs.doubleValue(iditer) - 1) / (nplof * sqrt2))); - if (progressLOOPs != null) { + if(progressLOOPs != null) { progressLOOPs.incrementProcessed(LOG); } } } - if (stepprog != null) { + if(stepprog != null) { stepprog.setCompleted(LOG); } @@ -351,9 +354,10 @@ public class LoOP<O, D extends NumberDistance<D, ?>> extends AbstractAlgorithm<O @Override public TypeInformation[] getInputTypeRestriction() { final TypeInformation type; - if (reachabilityDistanceFunction.equals(comparisonDistanceFunction)) { + if(reachabilityDistanceFunction.equals(comparisonDistanceFunction)) { type = reachabilityDistanceFunction.getInputTypeRestriction(); - } else { + } + else { type = new CombinedTypeInformation(reachabilityDistanceFunction.getInputTypeRestriction(), comparisonDistanceFunction.getInputTypeRestriction()); } return TypeUtil.array(type); @@ -401,34 +405,35 @@ public class LoOP<O, D extends NumberDistance<D, ?>> extends AbstractAlgorithm<O protected void makeOptions(Parameterization config) { super.makeOptions(config); final IntParameter kcompP = new IntParameter(KCOMP_ID); - kcompP.addConstraint(new GreaterConstraint(1)); - if (config.grab(kcompP)) { + kcompP.addConstraint(CommonConstraints.GREATER_THAN_ONE_INT); + if(config.grab(kcompP)) { kcomp = kcompP.intValue(); } final ObjectParameter<DistanceFunction<O, D>> compDistP = new ObjectParameter<>(COMPARISON_DISTANCE_FUNCTION_ID, DistanceFunction.class, EuclideanDistanceFunction.class); - if (config.grab(compDistP)) { + if(config.grab(compDistP)) { comparisonDistanceFunction = compDistP.instantiateClass(config); } final IntParameter kreachP = new IntParameter(KREACH_ID); - kreachP.addConstraint(new GreaterConstraint(1)); + 
kreachP.addConstraint(CommonConstraints.GREATER_THAN_ONE_INT); kreachP.setOptional(true); - if (config.grab(kreachP)) { + if(config.grab(kreachP)) { kreach = kreachP.intValue(); - } else { + } + else { kreach = kcomp; } final ObjectParameter<DistanceFunction<O, D>> reachDistP = new ObjectParameter<>(REACHABILITY_DISTANCE_FUNCTION_ID, DistanceFunction.class, true); - if (config.grab(reachDistP)) { + if(config.grab(reachDistP)) { reachabilityDistanceFunction = reachDistP.instantiateClass(config); } // TODO: make default 1.0? final DoubleParameter lambdaP = new DoubleParameter(LAMBDA_ID, 2.0); - lambdaP.addConstraint(new GreaterConstraint(0.0)); - if (config.grab(lambdaP)) { + lambdaP.addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE); + if(config.grab(lambdaP)) { lambda = lambdaP.doubleValue(); } } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/SimpleKernelDensityLOF.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/SimpleKernelDensityLOF.java index 2ff7534a..b990ef35 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/SimpleKernelDensityLOF.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/SimpleKernelDensityLOF.java @@ -55,13 +55,14 @@ import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress; import de.lmu.ifi.dbs.elki.logging.progress.StepProgress; import de.lmu.ifi.dbs.elki.math.DoubleMinMax; +import de.lmu.ifi.dbs.elki.math.MathUtil; import de.lmu.ifi.dbs.elki.math.statistics.kernelfunctions.EpanechnikovKernelDensityFunction; import de.lmu.ifi.dbs.elki.math.statistics.kernelfunctions.KernelDensityFunction; import de.lmu.ifi.dbs.elki.result.outlier.OutlierResult; import de.lmu.ifi.dbs.elki.result.outlier.OutlierScoreMeta; import de.lmu.ifi.dbs.elki.result.outlier.QuotientOutlierScoreMeta; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint; +import 
de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; @@ -151,7 +152,7 @@ public class SimpleKernelDensityLOF<O extends NumberVector<?>, D extends NumberD } double max = ((DoubleDistanceKNNList)knnq.getKNNForDBID(neighbor, k)).doubleKNNDistance(); final double v = neighbor.doubleDistance() / max; - sum += kernel.density(v) / Math.pow(max, dim); + sum += kernel.density(v) / MathUtil.powi(max, dim); count++; } } else { @@ -161,7 +162,7 @@ public class SimpleKernelDensityLOF<O extends NumberVector<?>, D extends NumberD } double max = knnq.getKNNForDBID(neighbor, k).getKNNDistance().doubleValue(); final double v = neighbor.getDistance().doubleValue() / max; - sum += kernel.density(v) / Math.pow(max, dim); + sum += kernel.density(v) / MathUtil.powi(max, dim); count++; } } @@ -268,7 +269,7 @@ public class SimpleKernelDensityLOF<O extends NumberVector<?>, D extends NumberD super.makeOptions(config); final IntParameter pK = new IntParameter(LOF.Parameterizer.K_ID); - pK.addConstraint(new GreaterConstraint(1)); + pK.addConstraint(CommonConstraints.GREATER_THAN_ONE_INT); if (config.grab(pK)) { k = pK.getValue(); } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/SimplifiedLOF.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/SimplifiedLOF.java index 413eaca1..d54b053f 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/SimplifiedLOF.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/SimplifiedLOF.java @@ -57,7 +57,7 @@ import de.lmu.ifi.dbs.elki.result.outlier.OutlierScoreMeta; import de.lmu.ifi.dbs.elki.result.outlier.QuotientOutlierScoreMeta; import de.lmu.ifi.dbs.elki.utilities.Alias; import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; -import 
de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; @@ -118,8 +118,8 @@ public class SimplifiedLOF<O, D extends NumberDistance<D, ?>> extends AbstractDi // "HEAVY" flag for KNN Query since it is used more than once KNNQuery<O, D> knnq = QueryUtil.getKNNQuery(relation, getDistanceFunction(), k, DatabaseQuery.HINT_HEAVY_USE, DatabaseQuery.HINT_OPTIMIZED_ONLY, DatabaseQuery.HINT_NO_CACHE); // No optimized kNN query - use a preprocessor! - if (!(knnq instanceof PreprocessorKNNQuery)) { - if (stepprog != null) { + if(!(knnq instanceof PreprocessorKNNQuery)) { + if(stepprog != null) { stepprog.beginStep(1, "Materializing neighborhoods w.r.t. distance function.", LOG); } MaterializeKNNPreprocessor<O, D> preproc = new MaterializeKNNPreprocessor<>(relation, getDistanceFunction(), k); @@ -129,27 +129,28 @@ public class SimplifiedLOF<O, D extends NumberDistance<D, ?>> extends AbstractDi } // Compute LRDs - if (stepprog != null) { + if(stepprog != null) { stepprog.beginStep(2, "Computing densities.", LOG); } WritableDoubleDataStore dens = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP); FiniteProgress densProgress = LOG.isVerbose() ? 
new FiniteProgress("Densities", ids.size(), LOG) : null; - for (DBIDIter it = ids.iter(); it.valid(); it.advance()) { + for(DBIDIter it = ids.iter(); it.valid(); it.advance()) { final KNNList<D> neighbors = knnq.getKNNForDBID(it, k); double sum = 0.0; int count = 0; - if (neighbors instanceof DoubleDistanceKNNList) { + if(neighbors instanceof DoubleDistanceKNNList) { // Fast version for double distances - for (DoubleDistanceDBIDListIter neighbor = ((DoubleDistanceKNNList) neighbors).iter(); neighbor.valid(); neighbor.advance()) { - if (DBIDUtil.equal(neighbor, it)) { + for(DoubleDistanceDBIDListIter neighbor = ((DoubleDistanceKNNList) neighbors).iter(); neighbor.valid(); neighbor.advance()) { + if(DBIDUtil.equal(neighbor, it)) { continue; } sum += neighbor.doubleDistance(); count++; } - } else { - for (DistanceDBIDListIter<D> neighbor = neighbors.iter(); neighbor.valid(); neighbor.advance()) { - if (DBIDUtil.equal(neighbor, it)) { + } + else { + for(DistanceDBIDListIter<D> neighbor = neighbors.iter(); neighbor.valid(); neighbor.advance()) { + if(DBIDUtil.equal(neighbor, it)) { continue; } sum += neighbor.getDistance().doubleValue(); @@ -159,16 +160,16 @@ public class SimplifiedLOF<O, D extends NumberDistance<D, ?>> extends AbstractDi // Avoid division by 0 final double lrd = (sum > 0) ? (count / sum) : 0; dens.putDouble(it, lrd); - if (densProgress != null) { + if(densProgress != null) { densProgress.incrementProcessed(LOG); } } - if (densProgress != null) { + if(densProgress != null) { densProgress.ensureCompleted(LOG); } // compute LOF_SCORE of each db object - if (stepprog != null) { + if(stepprog != null) { stepprog.beginStep(3, "Computing SLOFs.", LOG); } WritableDoubleDataStore lofs = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_STATIC); @@ -176,38 +177,39 @@ public class SimplifiedLOF<O, D extends NumberDistance<D, ?>> extends AbstractDi DoubleMinMax lofminmax = new DoubleMinMax(); FiniteProgress progressLOFs = LOG.isVerbose() ? 
new FiniteProgress("Simple LOF scores.", ids.size(), LOG) : null; - for (DBIDIter it = ids.iter(); it.valid(); it.advance()) { + for(DBIDIter it = ids.iter(); it.valid(); it.advance()) { final double lrdp = dens.doubleValue(it); final double lof; - if (lrdp > 0) { + if(lrdp > 0) { final KNNList<D> neighbors = knnq.getKNNForDBID(it, k); double sum = 0.0; int count = 0; - for (DBIDIter neighbor = neighbors.iter(); neighbor.valid(); neighbor.advance()) { + for(DBIDIter neighbor = neighbors.iter(); neighbor.valid(); neighbor.advance()) { // skip the point itself - if (DBIDUtil.equal(neighbor, it)) { + if(DBIDUtil.equal(neighbor, it)) { continue; } sum += dens.doubleValue(neighbor); count++; } lof = sum / (count * lrdp); - } else { + } + else { lof = 1.0; } lofs.putDouble(it, lof); // update minimum and maximum lofminmax.put(lof); - if (progressLOFs != null) { + if(progressLOFs != null) { progressLOFs.incrementProcessed(LOG); } } - if (progressLOFs != null) { + if(progressLOFs != null) { progressLOFs.ensureCompleted(LOG); } - if (stepprog != null) { + if(stepprog != null) { stepprog.setCompleted(LOG); } @@ -250,8 +252,8 @@ public class SimplifiedLOF<O, D extends NumberDistance<D, ?>> extends AbstractDi super.makeOptions(config); final IntParameter pK = new IntParameter(LOF.Parameterizer.K_ID); - pK.addConstraint(new GreaterConstraint(1)); - if (config.grab(pK)) { + pK.addConstraint(CommonConstraints.GREATER_THAN_ONE_INT); + if(config.grab(pK)) { k = pK.getValue(); } } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/meta/ExternalDoubleOutlierScore.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/meta/ExternalDoubleOutlierScore.java index 0d0f7303..757b80ad 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/meta/ExternalDoubleOutlierScore.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/meta/ExternalDoubleOutlierScore.java @@ -52,6 +52,7 @@ import de.lmu.ifi.dbs.elki.result.outlier.InvertedOutlierScoreMeta; import 
de.lmu.ifi.dbs.elki.result.outlier.OutlierResult; import de.lmu.ifi.dbs.elki.result.outlier.OutlierScoreMeta; import de.lmu.ifi.dbs.elki.utilities.FileUtil; +import de.lmu.ifi.dbs.elki.utilities.FormatUtil; import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException; import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; @@ -175,7 +176,7 @@ public class ExternalDoubleOutlierScore extends AbstractAlgorithm<OutlierResult> if(!Double.isNaN(score)) { throw new AbortException("Score pattern matched twice: previous value " + score + " second value: " + str); } - score = Double.parseDouble(str.substring(ms.end())); + score = FormatUtil.parseDouble(str.substring(ms.end())); } } if(id != null && !Double.isNaN(score)) { diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/meta/FeatureBagging.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/meta/FeatureBagging.java index 22c20fc3..5b681106 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/meta/FeatureBagging.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/meta/FeatureBagging.java @@ -54,8 +54,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import de.lmu.ifi.dbs.elki.utilities.documentation.Title; import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.Flag; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; @@ -136,12 +135,12 @@ public class FeatureBagging extends AbstractAlgorithm<OutlierResult> implements 
final int dbdim = RelationUtil.dimensionality(relation); final int mindim = dbdim >> 1; final int maxdim = dbdim - 1; - final Random rand = rnd.getRandom(); + final Random rand = rnd.getSingleThreadedRandom(); ArrayList<OutlierResult> results = new ArrayList<>(num); { FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("LOF iterations", num, LOG) : null; - for (int i = 0; i < num; i++) { + for(int i = 0; i < num; i++) { BitSet dimset = randomSubspace(dbdim, mindim, maxdim, rand); SubspaceEuclideanDistanceFunction df = new SubspaceEuclideanDistanceFunction(dimset); LOF<NumberVector<?>, DoubleDistance> lof = new LOF<>(k, df); @@ -149,18 +148,18 @@ public class FeatureBagging extends AbstractAlgorithm<OutlierResult> implements // run LOF and collect the result OutlierResult result = lof.run(database, relation); results.add(result); - if (prog != null) { + if(prog != null) { prog.incrementProcessed(LOG); } } - if (prog != null) { + if(prog != null) { prog.ensureCompleted(LOG); } } WritableDoubleDataStore scores = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC); DoubleMinMax minmax = new DoubleMinMax(); - if (breadth) { + if(breadth) { FiniteProgress cprog = LOG.isVerbose() ? new FiniteProgress("Combining results", relation.size(), LOG) : null; Pair<DBIDIter, Relation<Double>>[] IDVectorOntoScoreVector = Pair.newPairArray(results.size()); @@ -168,55 +167,57 @@ public class FeatureBagging extends AbstractAlgorithm<OutlierResult> implements // We need to initialize them now be able to iterate them "in parallel". { int i = 0; - for (OutlierResult r : results) { + for(OutlierResult r : results) { IDVectorOntoScoreVector[i] = new Pair<DBIDIter, Relation<Double>>(r.getOrdering().iter(relation.getDBIDs()).iter(), r.getScores()); i++; } } // Iterating over the *lines* of the AS_t(i)-matrix. - for (int i = 0; i < relation.size(); i++) { + for(int i = 0; i < relation.size(); i++) { // Iterating over the elements of a line (breadth-first). 
- for (Pair<DBIDIter, Relation<Double>> pair : IDVectorOntoScoreVector) { + for(Pair<DBIDIter, Relation<Double>> pair : IDVectorOntoScoreVector) { DBIDIter iter = pair.first; // Always true if every algorithm returns a complete result (one score // for every DBID). - if (iter.valid()) { + if(iter.valid()) { double score = pair.second.get(iter); - if (Double.isNaN(scores.doubleValue(iter))) { + if(Double.isNaN(scores.doubleValue(iter))) { scores.putDouble(iter, score); minmax.put(score); } iter.advance(); - } else { + } + else { LOG.warning("Incomplete result: Iterator does not contain |DB| DBIDs"); } } // Progress does not take the initial mapping into account. - if (cprog != null) { + if(cprog != null) { cprog.incrementProcessed(LOG); } } - if (cprog != null) { + if(cprog != null) { cprog.ensureCompleted(LOG); } - } else { + } + else { FiniteProgress cprog = LOG.isVerbose() ? new FiniteProgress("Combining results", relation.size(), LOG) : null; - for (DBIDIter iter = relation.iterDBIDs(); iter.valid(); iter.advance()) { + for(DBIDIter iter = relation.iterDBIDs(); iter.valid(); iter.advance()) { double sum = 0.0; - for (OutlierResult r : results) { + for(OutlierResult r : results) { final Double s = r.getScores().get(iter); - if (s != null && !Double.isNaN(s)) { + if(s != null && !Double.isNaN(s)) { sum += s; } } scores.putDouble(iter, sum); minmax.put(sum); - if (cprog != null) { + if(cprog != null) { cprog.incrementProcessed(LOG); } } - if (cprog != null) { + if(cprog != null) { cprog.ensureCompleted(LOG); } } @@ -237,13 +238,13 @@ public class FeatureBagging extends AbstractAlgorithm<OutlierResult> implements BitSet dimset = new BitSet(); // Fill with all dimensions int[] dims = new int[alldim]; - for (int d = 0; d < alldim; d++) { + for(int d = 0; d < alldim; d++) { dims[d] = d; } // Target dimensionality: int subdim = mindim + rand.nextInt(maxdim - mindim); // Shrink the subspace to the destination size - for (int d = 0; d < alldim - subdim; d++) { + for(int d 
= 0; d < alldim - subdim; d++) { int s = rand.nextInt(alldim - d); dimset.set(dims[s]); dims[s] = dims[alldim - d - 1]; @@ -317,21 +318,21 @@ public class FeatureBagging extends AbstractAlgorithm<OutlierResult> implements protected void makeOptions(Parameterization config) { super.makeOptions(config); final IntParameter pK = new IntParameter(LOF.Parameterizer.K_ID); - pK.addConstraint(new GreaterConstraint(1)); - if (config.grab(pK)) { + pK.addConstraint(CommonConstraints.GREATER_THAN_ONE_INT); + if(config.grab(pK)) { k = pK.getValue(); } IntParameter numP = new IntParameter(NUM_ID); - numP.addConstraint(new GreaterEqualConstraint(1)); - if (config.grab(numP)) { + numP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT); + if(config.grab(numP)) { num = numP.getValue(); } Flag breadthF = new Flag(BREADTH_ID); - if (config.grab(breadthF)) { + if(config.grab(breadthF)) { breadth = breadthF.getValue(); } RandomParameter rndP = new RandomParameter(SEED_ID); - if (config.grab(rndP)) { + if(config.grab(rndP)) { rnd = rndP.getValue(); } } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/meta/HiCS.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/meta/HiCS.java index 69608293..f92a8b80 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/meta/HiCS.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/meta/HiCS.java @@ -72,7 +72,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
@@ -174,9 +174,9 @@ public class HiCS<V extends NumberVector<?>> extends AbstractAlgorithm<OutlierRe final DBIDs ids = relation.getDBIDs();
ArrayList<ArrayDBIDs> subspaceIndex = buildOneDimIndexes(relation);
- Set<HiCSSubspace> subspaces = calculateSubspaces(relation, subspaceIndex, rnd.getRandom());
+ Set<HiCSSubspace> subspaces = calculateSubspaces(relation, subspaceIndex, rnd.getSingleThreadedRandom());
- if (LOG.isVerbose()) {
+ if(LOG.isVerbose()) {
LOG.verbose("Number of high-contrast subspaces: " + subspaces.size());
}
List<Relation<Double>> results = new ArrayList<>();
@@ -185,8 +185,8 @@ public class HiCS<V extends NumberVector<?>> extends AbstractAlgorithm<OutlierRe // run outlier detection and collect the result
// TODO extend so that any outlierAlgorithm can be used (use materialized
// relation instead of SubspaceEuclideanDistanceFunction?)
- for (HiCSSubspace dimset : subspaces) {
- if (LOG.isVerbose()) {
+ for(HiCSSubspace dimset : subspaces) {
+ if(LOG.isVerbose()) {
LOG.verbose("Performing outlier detection in subspace " + dimset);
}
@@ -196,22 +196,22 @@ public class HiCS<V extends NumberVector<?>> extends AbstractAlgorithm<OutlierRe // run LOF and collect the result
OutlierResult result = outlierAlgorithm.run(pdb);
results.add(result.getScores());
- if (prog != null) {
+ if(prog != null) {
prog.incrementProcessed(LOG);
}
}
- if (prog != null) {
+ if(prog != null) {
prog.ensureCompleted(LOG);
}
WritableDoubleDataStore scores = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC);
DoubleMinMax minmax = new DoubleMinMax();
- for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
+ for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
double sum = 0.0;
- for (Relation<Double> r : results) {
+ for(Relation<Double> r : results) {
final Double s = r.get(iditer);
- if (s != null && !Double.isNaN(s)) {
+ if(s != null && !Double.isNaN(s)) {
sum += s;
}
}
@@ -237,7 +237,7 @@ public class HiCS<V extends NumberVector<?>> extends AbstractAlgorithm<OutlierRe ArrayList<ArrayDBIDs> subspaceIndex = new ArrayList<>(dim + 1);
SortDBIDsBySingleDimension comp = new VectorUtil.SortDBIDsBySingleDimension(relation);
- for (int i = 0; i < dim; i++) {
+ for(int i = 0; i < dim; i++) {
ArrayModifiableDBIDs amDBIDs = DBIDUtil.newArray(relation.getDBIDs());
comp.setDimension(i);
amDBIDs.sort(comp);
@@ -258,7 +258,7 @@ public class HiCS<V extends NumberVector<?>> extends AbstractAlgorithm<OutlierRe final int dbdim = RelationUtil.dimensionality(relation);
FiniteProgress dprog = LOG.isVerbose() ? new FiniteProgress("Subspace dimensionality", dbdim, LOG) : null;
- if (dprog != null) {
+ if(dprog != null) {
dprog.setProcessed(2, LOG);
}
@@ -266,31 +266,31 @@ public class HiCS<V extends NumberVector<?>> extends AbstractAlgorithm<OutlierRe TopBoundedHeap<HiCSSubspace> dDimensionalList = new TopBoundedHeap<>(cutoff, HiCSSubspace.SORT_BY_CONTRAST_ASC);
FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Generating two-element subsets", (dbdim * (dbdim - 1)) >> 1, LOG) : null;
// compute two-element sets of subspaces
- for (int i = 0; i < dbdim; i++) {
- for (int j = i + 1; j < dbdim; j++) {
+ for(int i = 0; i < dbdim; i++) {
+ for(int j = i + 1; j < dbdim; j++) {
HiCSSubspace ts = new HiCSSubspace();
ts.set(i);
ts.set(j);
calculateContrast(relation, ts, subspaceIndex, random);
dDimensionalList.add(ts);
- if (prog != null) {
+ if(prog != null) {
prog.incrementProcessed(LOG);
}
}
}
- if (prog != null) {
+ if(prog != null) {
prog.ensureCompleted(LOG);
}
IndefiniteProgress qprog = LOG.isVerbose() ? new IndefiniteProgress("Testing subspace candidates", LOG) : null;
- for (int d = 3; !dDimensionalList.isEmpty(); d++) {
- if (dprog != null) {
+ for(int d = 3; !dDimensionalList.isEmpty(); d++) {
+ if(dprog != null) {
dprog.setProcessed(d, LOG);
}
// result now contains all d-dimensional sets of subspaces
ArrayList<HiCSSubspace> candidateList = new ArrayList<>(dDimensionalList.size());
- for (Heap<HiCSSubspace>.UnorderedIter it = dDimensionalList.unorderedIter(); it.valid(); it.advance()) {
+ for(Heap<HiCSSubspace>.UnorderedIter it = dDimensionalList.unorderedIter(); it.valid(); it.advance()) {
subspaceList.add(it.get());
candidateList.add(it.get());
}
@@ -299,39 +299,39 @@ public class HiCS<V extends NumberVector<?>> extends AbstractAlgorithm<OutlierRe Collections.sort(candidateList, HiCSSubspace.SORT_BY_SUBSPACE);
// TODO: optimize APRIORI style, by not even computing the bit set or?
- for (int i = 0; i < candidateList.size() - 1; i++) {
- for (int j = i + 1; j < candidateList.size(); j++) {
+ for(int i = 0; i < candidateList.size() - 1; i++) {
+ for(int j = i + 1; j < candidateList.size(); j++) {
HiCSSubspace set1 = candidateList.get(i);
HiCSSubspace set2 = candidateList.get(j);
HiCSSubspace joinedSet = new HiCSSubspace();
joinedSet.or(set1);
joinedSet.or(set2);
- if (joinedSet.cardinality() != d) {
+ if(joinedSet.cardinality() != d) {
continue;
}
calculateContrast(relation, joinedSet, subspaceIndex, random);
dDimensionalList.add(joinedSet);
- if (qprog != null) {
+ if(qprog != null) {
qprog.incrementProcessed(LOG);
}
}
}
// Prune
- for (HiCSSubspace cand : candidateList) {
- for (Heap<HiCSSubspace>.UnorderedIter it = dDimensionalList.unorderedIter(); it.valid(); it.advance()) {
- if (it.get().contrast > cand.contrast) {
+ for(HiCSSubspace cand : candidateList) {
+ for(Heap<HiCSSubspace>.UnorderedIter it = dDimensionalList.unorderedIter(); it.valid(); it.advance()) {
+ if(it.get().contrast > cand.contrast) {
subspaceList.remove(cand);
break;
}
}
}
}
- if (qprog != null) {
+ if(qprog != null) {
qprog.setCompleted(LOG);
}
- if (dprog != null) {
+ if(dprog != null) {
dprog.setProcessed(dbdim, LOG);
dprog.ensureCompleted(LOG);
}
@@ -353,17 +353,17 @@ public class HiCS<V extends NumberVector<?>> extends AbstractAlgorithm<OutlierRe int retries = 0;
double deviationSum = 0.0;
- for (int i = 0; i < m; i++) {
+ for(int i = 0; i < m; i++) {
// Choose a random set bit.
int chosen = -1;
- for (int tmp = random.nextInt(card); tmp >= 0; tmp--) {
+ for(int tmp = random.nextInt(card); tmp >= 0; tmp--) {
chosen = subspace.nextSetBit(chosen + 1);
}
// initialize sample
DBIDs conditionalSample = relation.getDBIDs();
- for (int j = subspace.nextSetBit(0); j >= 0; j = subspace.nextSetBit(j + 1)) {
- if (j == chosen) {
+ for(int j = subspace.nextSetBit(0); j >= 0; j = subspace.nextSetBit(j + 1)) {
+ if(j == chosen) {
continue;
}
ArrayDBIDs sortedIndices = subspaceIndex.get(j);
@@ -371,20 +371,21 @@ public class HiCS<V extends NumberVector<?>> extends AbstractAlgorithm<OutlierRe // initialize index block
DBIDArrayIter iter = sortedIndices.iter();
iter.seek(random.nextInt(relation.size() - windowsize));
- for (int k = 0; k < windowsize; k++, iter.advance()) {
+ for(int k = 0; k < windowsize; k++, iter.advance()) {
indexBlock.add(iter); // select index block
}
conditionalSample = DBIDUtil.intersection(conditionalSample, indexBlock);
}
- if (conditionalSample.size() < 10) {
+ if(conditionalSample.size() < 10) {
retries++;
- if (LOG.isDebugging()) {
+ if(LOG.isDebugging()) {
LOG.debug("Sample size very small. Retry no. " + retries);
}
- if (retries >= MAX_RETRIES) {
+ if(retries >= MAX_RETRIES) {
LOG.warning("Too many retries, for small samples: " + retries);
- } else {
+ }
+ else {
i--;
continue;
}
@@ -393,7 +394,7 @@ public class HiCS<V extends NumberVector<?>> extends AbstractAlgorithm<OutlierRe double[] sampleValues = new double[conditionalSample.size()];
{
int l = 0;
- for (DBIDIter iter = conditionalSample.iter(); iter.valid(); iter.advance()) {
+ for(DBIDIter iter = conditionalSample.iter(); iter.valid(); iter.advance()) {
sampleValues[l] = relation.get(iter).doubleValue(chosen);
l++;
}
@@ -402,23 +403,23 @@ public class HiCS<V extends NumberVector<?>> extends AbstractAlgorithm<OutlierRe double[] fullValues = new double[relation.size()];
{
int l = 0;
- for (DBIDIter iter = subspaceIndex.get(chosen).iter(); iter.valid(); iter.advance()) {
+ for(DBIDIter iter = subspaceIndex.get(chosen).iter(); iter.valid(); iter.advance()) {
fullValues[l] = relation.get(iter).doubleValue(chosen);
l++;
}
}
double contrast = statTest.deviation(fullValues, sampleValues);
- if (Double.isNaN(contrast)) {
+ if(Double.isNaN(contrast)) {
i--;
LOG.warning("Contrast was NaN");
continue;
}
deviationSum += contrast;
- if (prog != null) {
+ if(prog != null) {
prog.incrementProcessed(LOG);
}
}
- if (prog != null) {
+ if(prog != null) {
prog.ensureCompleted(LOG);
}
subspace.contrast = deviationSum / m;
@@ -464,7 +465,7 @@ public class HiCS<V extends NumberVector<?>> extends AbstractAlgorithm<OutlierRe public String toString() {
StringBuilder buf = new StringBuilder();
buf.append("[contrast=").append(contrast);
- for (int i = nextSetBit(0); i >= 0; i = nextSetBit(i + 1)) {
+ for(int i = nextSetBit(0); i >= 0; i = nextSetBit(i + 1)) {
buf.append(' ').append(i + 1);
}
buf.append(']');
@@ -477,7 +478,7 @@ public class HiCS<V extends NumberVector<?>> extends AbstractAlgorithm<OutlierRe public static final Comparator<HiCSSubspace> SORT_BY_CONTRAST_ASC = new Comparator<HiCSSubspace>() {
@Override
public int compare(HiCSSubspace o1, HiCSSubspace o2) {
- if (o1.contrast == o2.contrast) {
+ if(o1.contrast == o2.contrast) {
return 0;
}
return o1.contrast > o2.contrast ? 1 : -1;
@@ -490,7 +491,7 @@ public class HiCS<V extends NumberVector<?>> extends AbstractAlgorithm<OutlierRe public static final Comparator<HiCSSubspace> SORT_BY_CONTRAST_DESC = new Comparator<HiCSSubspace>() {
@Override
public int compare(HiCSSubspace o1, HiCSSubspace o2) {
- if (o1.contrast == o2.contrast) {
+ if(o1.contrast == o2.contrast) {
return 0;
}
return o1.contrast < o2.contrast ? 1 : -1;
@@ -505,10 +506,11 @@ public class HiCS<V extends NumberVector<?>> extends AbstractAlgorithm<OutlierRe public int compare(HiCSSubspace o1, HiCSSubspace o2) {
int dim1 = o1.nextSetBit(0);
int dim2 = o2.nextSetBit(0);
- while (dim1 >= 0 && dim2 >= 0) {
- if (dim1 < dim2) {
+ while(dim1 >= 0 && dim2 >= 0) {
+ if(dim1 < dim2) {
return -1;
- } else if (dim1 > dim2) {
+ }
+ else if(dim1 > dim2) {
return 1;
}
dim1 = o1.nextSetBit(dim1 + 1);
@@ -597,35 +599,35 @@ public class HiCS<V extends NumberVector<?>> extends AbstractAlgorithm<OutlierRe protected void makeOptions(Parameterization config) {
super.makeOptions(config);
final IntParameter mP = new IntParameter(M_ID, 50);
- mP.addConstraint(new GreaterConstraint(1));
- if (config.grab(mP)) {
+ mP.addConstraint(CommonConstraints.GREATER_THAN_ONE_INT);
+ if(config.grab(mP)) {
m = mP.intValue();
}
final DoubleParameter alphaP = new DoubleParameter(ALPHA_ID, 0.1);
- alphaP.addConstraint(new GreaterConstraint(0));
- if (config.grab(alphaP)) {
+ alphaP.addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE);
+ if(config.grab(alphaP)) {
alpha = alphaP.doubleValue();
}
final ObjectParameter<OutlierAlgorithm> algoP = new ObjectParameter<>(ALGO_ID, OutlierAlgorithm.class, LOF.class);
- if (config.grab(algoP)) {
+ if(config.grab(algoP)) {
outlierAlgorithm = algoP.instantiateClass(config);
}
final ObjectParameter<GoodnessOfFitTest> testP = new ObjectParameter<>(TEST_ID, GoodnessOfFitTest.class, KolmogorovSmirnovTest.class);
- if (config.grab(testP)) {
+ if(config.grab(testP)) {
statTest = testP.instantiateClass(config);
}
final IntParameter cutoffP = new IntParameter(LIMIT_ID, 100);
- cutoffP.addConstraint(new GreaterConstraint(1));
- if (config.grab(cutoffP)) {
+ cutoffP.addConstraint(CommonConstraints.GREATER_THAN_ONE_INT);
+ if(config.grab(cutoffP)) {
cutoff = cutoffP.intValue();
}
final RandomParameter rndP = new RandomParameter(SEED_ID);
- if (config.grab(rndP)) {
+ if(config.grab(rndP)) {
rnd = rndP.getValue();
}
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/CTLuRandomWalkEC.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/CTLuRandomWalkEC.java index c8efe4da..85524b4e 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/CTLuRandomWalkEC.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/CTLuRandomWalkEC.java @@ -56,7 +56,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Description; import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
@@ -149,7 +149,7 @@ public class CTLuRandomWalkEC<N, D extends NumberDistance<D, ?>> extends Abstrac }
final double e;
final D distance = distFunc.distance(id, n);
- heap.add(distance, n);
+ heap.insert(distance, n);
double dist = distance.doubleValue();
if(dist == 0) {
LOG.warning("Zero distances are not supported - skipping: " + DBIDUtil.toString(id) + " " + DBIDUtil.toString(n));
@@ -296,7 +296,7 @@ public class CTLuRandomWalkEC<N, D extends NumberDistance<D, ?>> extends Abstrac */
protected void configK(Parameterization config) {
final IntParameter param = new IntParameter(K_ID);
- param.addConstraint(new GreaterEqualConstraint(1));
+ param.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
if(config.grab(param)) {
k = param.getValue();
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/TrimmedMeanApproach.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/TrimmedMeanApproach.java index e07ce480..1a1f9a82 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/TrimmedMeanApproach.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/TrimmedMeanApproach.java @@ -1,26 +1,27 @@ package de.lmu.ifi.dbs.elki.algorithm.outlier.spatial;
-/* -This file is part of ELKI: -Environment for Developing KDD-Applications Supported by Index-Structures - -Copyright (C) 2013 -Ludwig-Maximilians-Universität München -Lehr- und Forschungseinheit für Datenbanksysteme -ELKI Development Team - -This program is free software: you can redistribute it and/or modify -it under the terms of the GNU Affero General Public License as published by -the Free Software Foundation, either version 3 of the License, or -(at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU Affero General Public License for more details. - -You should have received a copy of the GNU Affero General Public License -along with this program. If not, see <http://www.gnu.org/licenses/>. -*/ +
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures +
+ Copyright (C) 2013
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team +
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version. +
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details. +
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
import java.util.Arrays;
@@ -50,15 +51,15 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Description; import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.LessConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter;
/**
* A Trimmed Mean Approach to Finding Spatial Outliers.
*
- * Outliers are defined by their value deviation from a trimmed mean of the neighbors.
+ * Outliers are defined by their value deviation from a trimmed mean of the
+ * neighbors.
*
* <p>
* Reference: <br>
@@ -116,7 +117,7 @@ public class TrimmedMeanApproach<N> extends AbstractNeighborhoodOutlier<N> { WritableDoubleDataStore scores = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC);
FiniteProgress progress = LOG.isVerbose() ? new FiniteProgress("Computing trimmed means", relation.size(), LOG) : null;
- for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
DBIDs neighbors = npred.getNeighborDBIDs(iditer);
int num = 0;
double[] values = new double[neighbors.size()];
@@ -161,7 +162,7 @@ public class TrimmedMeanApproach<N> extends AbstractNeighborhoodOutlier<N> { double[] ei = new double[relation.size()];
{
int i = 0;
- for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
ei[i] = errors.doubleValue(iditer);
i++;
}
@@ -180,7 +181,7 @@ public class TrimmedMeanApproach<N> extends AbstractNeighborhoodOutlier<N> { }
// calculate score
DoubleMinMax minmax = new DoubleMinMax();
- for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
double score = Math.abs(errors.doubleValue(iditer)) * 0.6745 / median_dev_from_median;
scores.putDouble(iditer, score);
minmax.put(score);
@@ -228,8 +229,8 @@ public class TrimmedMeanApproach<N> extends AbstractNeighborhoodOutlier<N> { protected void makeOptions(Parameterization config) {
super.makeOptions(config);
DoubleParameter pP = new DoubleParameter(P_ID);
- pP.addConstraint(new GreaterConstraint(0.0)); - pP.addConstraint(new LessConstraint(0.5));
+ pP.addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE);
+ pP.addConstraint(CommonConstraints.LESS_THAN_HALF_DOUBLE);
if(config.grab(pP)) {
p = pP.getValue();
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/neighborhood/ExtendedNeighborhood.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/neighborhood/ExtendedNeighborhood.java index c4fc4407..c93b10cb 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/neighborhood/ExtendedNeighborhood.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/neighborhood/ExtendedNeighborhood.java @@ -38,7 +38,7 @@ import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress; import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; @@ -224,7 +224,7 @@ public class ExtendedNeighborhood extends AbstractPrecomputedNeighborhood { */ public static int getParameterSteps(Parameterization config) { final IntParameter param = new IntParameter(STEPS_ID); - param.addConstraint(new GreaterEqualConstraint(1)); + param.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT); if(config.grab(param)) { return param.getValue(); } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/neighborhood/ExternalNeighborhood.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/neighborhood/ExternalNeighborhood.java index 96896bd8..33b5010a 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/neighborhood/ExternalNeighborhood.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/neighborhood/ExternalNeighborhood.java @@ -161,8 +161,8 @@ public class ExternalNeighborhood extends AbstractPrecomputedNeighborhood { if(olq != 
null) { LabelList label = olq.get(iditer); if(label != null) { - for(String lbl : label) { - lblmap.put(lbl, DBIDUtil.deref(iditer)); + for(int i = 0; i < label.size(); i++) { + lblmap.put(label.get(i), DBIDUtil.deref(iditer)); } } } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/neighborhood/weighted/LinearWeightedExtendedNeighborhood.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/neighborhood/weighted/LinearWeightedExtendedNeighborhood.java index 05bf2f18..4d6ec635 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/neighborhood/weighted/LinearWeightedExtendedNeighborhood.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/neighborhood/weighted/LinearWeightedExtendedNeighborhood.java @@ -38,7 +38,7 @@ import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; @@ -216,7 +216,7 @@ public class LinearWeightedExtendedNeighborhood implements WeightedNeighborSetPr */ public static int getParameterSteps(Parameterization config) { final IntParameter param = new IntParameter(STEPS_ID); - param.addConstraint(new GreaterEqualConstraint(1)); + param.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT); if(config.grab(param)) { return param.getValue(); } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/subspace/OUTRES.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/subspace/OUTRES.java index ae04fef4..c21542da 100644 --- 
a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/subspace/OUTRES.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/subspace/OUTRES.java @@ -55,6 +55,7 @@ import de.lmu.ifi.dbs.elki.distance.distancevalue.DoubleDistance; import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress; import de.lmu.ifi.dbs.elki.math.DoubleMinMax; +import de.lmu.ifi.dbs.elki.math.MathUtil; import de.lmu.ifi.dbs.elki.math.MeanVariance; import de.lmu.ifi.dbs.elki.math.statistics.distribution.GammaDistribution; import de.lmu.ifi.dbs.elki.math.statistics.kernelfunctions.EpanechnikovKernelDensityFunction; @@ -368,7 +369,7 @@ public class OUTRES<V extends NumberVector<?>> extends AbstractAlgorithm<Outlier */ protected double optimalBandwidth(int dim) { // Pi in the publication is redundant and cancels out! - double hopt = 8 * GammaDistribution.gamma(dim / 2.0 + 1) * (dim + 4) * Math.pow(2, dim); + double hopt = 8 * GammaDistribution.gamma(dim / 2.0 + 1) * (dim + 4) * MathUtil.powi(2, dim); return hopt * Math.pow(relation.size(), (-1. 
/ (dim + 4))); } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/subspace/OutRankS1.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/subspace/OutRankS1.java index 96c8875f..3e248bfa 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/subspace/OutRankS1.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/subspace/OutRankS1.java @@ -49,7 +49,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import de.lmu.ifi.dbs.elki.utilities.documentation.Title; import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; @@ -186,7 +186,7 @@ public class OutRankS1 extends AbstractAlgorithm<OutlierResult> implements Outli algorithm = algP.instantiateClass(config); } DoubleParameter alphaP = new DoubleParameter(ALPHA_ID, 0.25); - alphaP.addConstraint(new GreaterConstraint(0)); + alphaP.addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE); if (config.grab(alphaP)) { alpha = alphaP.doubleValue(); } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/subspace/SOD.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/subspace/SOD.java index b2255e67..489f811b 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/subspace/SOD.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/subspace/SOD.java @@ -31,10 +31,10 @@ import de.lmu.ifi.dbs.elki.data.NumberVector; import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation; import de.lmu.ifi.dbs.elki.data.type.TypeInformation; import de.lmu.ifi.dbs.elki.data.type.TypeUtil; -import de.lmu.ifi.dbs.elki.database.Database; import 
de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; import de.lmu.ifi.dbs.elki.database.datastore.WritableDataStore; +import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore; import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDRef; @@ -44,7 +44,6 @@ import de.lmu.ifi.dbs.elki.database.ids.DoubleDBIDPair; import de.lmu.ifi.dbs.elki.database.query.similarity.SimilarityQuery; import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation; import de.lmu.ifi.dbs.elki.database.relation.Relation; -import de.lmu.ifi.dbs.elki.database.relation.RelationUtil; import de.lmu.ifi.dbs.elki.distance.distancefunction.subspace.SubspaceEuclideanDistanceFunction; import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance; import de.lmu.ifi.dbs.elki.distance.similarityfunction.SharedNearestNeighborSimilarityFunction; @@ -52,7 +51,9 @@ import de.lmu.ifi.dbs.elki.distance.similarityfunction.SimilarityFunction; import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress; import de.lmu.ifi.dbs.elki.math.DoubleMinMax; -import de.lmu.ifi.dbs.elki.result.ResultHierarchy; +import de.lmu.ifi.dbs.elki.math.Mean; +import de.lmu.ifi.dbs.elki.math.linearalgebra.Centroid; +import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; import de.lmu.ifi.dbs.elki.result.outlier.BasicOutlierScoreMeta; import de.lmu.ifi.dbs.elki.result.outlier.OutlierResult; import de.lmu.ifi.dbs.elki.result.outlier.OutlierScoreMeta; @@ -65,9 +66,10 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import de.lmu.ifi.dbs.elki.utilities.documentation.Title; import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint; +import 
de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.Flag; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; @@ -91,7 +93,6 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; * @param <V> the type of NumberVector handled by this Algorithm * @param <D> distance type */ -// todo arthur comment @Title("SOD: Subspace outlier degree") @Description("Outlier Detection in Axis-Parallel Subspaces of High Dimensional Data") @Reference(authors = "H.-P. Kriegel, P. Kröger, E. Schubert, A. Zimek", title = "Outlier Detection in Axis-Parallel Subspaces of High Dimensional Data", booktitle = "Proceedings of the 13th Pacific-Asia Conference on Knowledge Discovery and Data Mining (PAKDD), Bangkok, Thailand, 2009", url = "http://dx.doi.org/10.1007/978-3-642-01307-2") @@ -102,50 +103,39 @@ public class SOD<V extends NumberVector<?>, D extends NumberDistance<D, ?>> exte private static final Logging LOG = Logging.getLogger(SOD.class); /** - * Parameter to specify the number of shared nearest neighbors to be - * considered for learning the subspace properties., must be an integer - * greater than 0. - */ - public static final OptionID KNN_ID = new OptionID("sod.knn", "The number of most snn-similar objects to use as reference set for learning the subspace properties."); - - /** - * Parameter to indicate the multiplier for the discriminance value for - * discerning small from large variances. - */ - public static final OptionID ALPHA_ID = new OptionID("sod.alpha", "The multiplier for the discriminance value for discerning small from large variances."); - - /** - * Parameter for the similarity function. 
- */ - public static final OptionID SIM_ID = new OptionID("sod.similarity", "The similarity function used for the neighborhood set."); - - /** - * Holds the value of {@link #KNN_ID}. + * Neighborhood size. */ private int knn; /** - * Holds the value of {@link #ALPHA_ID}. + * Alpha (discriminance value). */ private double alpha; /** - * The similarity function {@link #SIM_ID}. + * Similarity function to use. */ private SimilarityFunction<V, D> similarityFunction; /** + * Report models. + */ + private boolean models; + + /** * Constructor with parameters. * * @param knn knn value * @param alpha Alpha parameter * @param similarityFunction Shared nearest neighbor similarity function + * @param models Report generated models */ - public SOD(int knn, double alpha, SimilarityFunction<V, D> similarityFunction) { + public SOD(int knn, double alpha, SimilarityFunction<V, D> similarityFunction, boolean models) { super(); this.knn = knn; this.alpha = alpha; this.similarityFunction = similarityFunction; + this.models = models; } /** @@ -157,26 +147,55 @@ public class SOD<V extends NumberVector<?>, D extends NumberDistance<D, ?>> exte public OutlierResult run(Relation<V> relation) { SimilarityQuery<V, D> snnInstance = similarityFunction.instantiate(relation); FiniteProgress progress = LOG.isVerbose() ? 
new FiniteProgress("Assigning Subspace Outlier Degree", relation.size(), LOG) : null; - WritableDataStore<SODModel<?>> sod_models = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC, SODModel.class); + final WritableDoubleDataStore sod_scores = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC); + WritableDataStore<SODModel> sod_models = null; + if (models) { // Models requested + sod_models = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC, SODModel.class); + } DoubleMinMax minmax = new DoubleMinMax(); for (DBIDIter iter = relation.iterDBIDs(); iter.valid(); iter.advance()) { if (progress != null) { progress.incrementProcessed(LOG); } - DBIDs knnList = getNearestNeighbors(relation, snnInstance, iter); - SODModel<V> model = new SODModel<>(relation, knnList, alpha, relation.get(iter)); - sod_models.put(iter, model); - minmax.put(model.getSod()); + DBIDs neighborhood = getNearestNeighbors(relation, snnInstance, iter); + + Vector center; + BitSet weightVector; + double sod; + if (neighborhood.size() > 0) { + center = Centroid.make(relation, neighborhood); + // Note: per-dimension variances; no covariances. + double[] variances = computePerDimensionVariances(relation, center, neighborhood); + double expectationOfVariance = Mean.of(variances); + weightVector = new BitSet(variances.length); + for (int d = 0; d < variances.length; d++) { + if (variances[d] < alpha * expectationOfVariance) { + weightVector.set(d, true); + } + } + sod = subspaceOutlierDegree(relation.get(iter), center, weightVector); + } else { + center = relation.get(iter).getColumnVector(); + weightVector = null; + sod = 0.; + } + + if (sod_models != null) { + sod_models.put(iter, new SODModel(center, weightVector)); + } + sod_scores.putDouble(iter, sod); + minmax.put(sod); } if (progress != null) { progress.ensureCompleted(LOG); } // combine results. 
- Relation<SODModel<?>> models = new MaterializedRelation<>("Subspace Outlier Model", "sod-outlier", new SimpleTypeInformation<SODModel<?>>(SODModel.class), sod_models, relation.getDBIDs()); OutlierScoreMeta meta = new BasicOutlierScoreMeta(minmax.getMin(), minmax.getMax()); - OutlierResult sodResult = new OutlierResult(meta, new SODProxyScoreResult(models, relation.getDBIDs())); - // also add the models. - sodResult.addChildResult(models); + OutlierResult sodResult = new OutlierResult(meta, new MaterializedRelation<>("Subspace Outlier Degree", "sod-outlier", TypeUtil.DOUBLE, sod_scores, relation.getDBIDs())); + if (sod_models != null) { + Relation<SODModel> models = new MaterializedRelation<>("Subspace Outlier Model", "sod-outlier", new SimpleTypeInformation<>(SODModel.class), sod_models, relation.getDBIDs()); + sodResult.addChildResult(models); + } return sodResult; } @@ -186,6 +205,8 @@ public class SOD<V extends NumberVector<?>, D extends NumberDistance<D, ?>> exte * <p/> * The query object is excluded from the knn list. * + * FIXME: move this to the database layer. 
+ * * @param relation the database holding the objects * @param simQ similarity function * @param queryObject the query object for which the kNNs should be determined @@ -193,14 +214,14 @@ public class SOD<V extends NumberVector<?>, D extends NumberDistance<D, ?>> exte * distance without the query object */ private DBIDs getNearestNeighbors(Relation<V> relation, SimilarityQuery<V, D> simQ, DBIDRef queryObject) { - // similarityFunction.getPreprocessor().getParameters(); Heap<DoubleDBIDPair> nearestNeighbors = new TiedTopBoundedHeap<>(knn); for (DBIDIter iter = relation.iterDBIDs(); iter.valid(); iter.advance()) { - if (!DBIDUtil.equal(iter, queryObject)) { - double sim = simQ.similarity(queryObject, iter).doubleValue(); - if (sim > 0) { - nearestNeighbors.add(DBIDUtil.newPair(sim, iter)); - } + if (DBIDUtil.equal(iter, queryObject)) { + continue; + } + double sim = simQ.similarity(queryObject, iter).doubleValue(); + if (sim > 0.) { + nearestNeighbors.add(DBIDUtil.newPair(sim, iter)); } } // Collect DBIDs @@ -211,6 +232,50 @@ public class SOD<V extends NumberVector<?>, D extends NumberDistance<D, ?>> exte return dbids; } + /** + * Compute the per-dimension variances for the given neighborhood and center. + * + * @param relation Data relation + * @param center Center vector + * @param neighborhood Neighbors + * @return Per-dimension variances. + */ + private static double[] computePerDimensionVariances(Relation<? 
extends NumberVector<?>> relation, Vector center, DBIDs neighborhood) { + double[] c = center.getArrayRef(); + double[] variances = new double[c.length]; + for (DBIDIter iter = neighborhood.iter(); iter.valid(); iter.advance()) { + NumberVector<?> databaseObject = relation.get(iter); + for (int d = 0; d < c.length; d++) { + final double deviation = databaseObject.doubleValue(d) - c[d]; + variances[d] += deviation * deviation; + } + } + for (int d = 0; d < variances.length; d++) { + variances[d] /= neighborhood.size(); + } + return variances; + } + + /** + * Compute SOD score. + * + * @param queryObject Query object + * @param center Center vector + * @param weightVector Weight vector + * @return sod score + */ + private double subspaceOutlierDegree(V queryObject, Vector center, BitSet weightVector) { + final int card = weightVector.cardinality(); + if (card == 0) { + return 0; + } + final SubspaceEuclideanDistanceFunction df = new SubspaceEuclideanDistanceFunction(weightVector); + double distance = df.distance(queryObject, center).doubleValue(); + distance /= card; // FIXME: defined as card, should be sqrt(card), + // unfortunately + return distance; + } + @Override public TypeInformation[] getInputTypeRestriction() { return TypeUtil.array(TypeUtil.NUMBER_VECTOR_FIELD); @@ -225,232 +290,89 @@ public class SOD<V extends NumberVector<?>, D extends NumberDistance<D, ?>> exte * SOD Model class * * @author Arthur Zimek - * @param <V> the type of DatabaseObjects handled by this Result */ - // TODO: arthur comment - public static class SODModel<V extends NumberVector<?>> implements TextWriteable, Comparable<SODModel<?>> { - private double[] centerValues; - - private V center; - - private double[] variances; - - private double expectationOfVariance; - - private BitSet weightVector; - - private double sod; - + public static class SODModel implements TextWriteable { /** - * Initialize SOD Model - * - * @param relation Database - * @param neighborhood Neighborhood - * @param 
alpha Alpha value - * @param queryObject Query object + * Center vector */ - public SODModel(Relation<V> relation, DBIDs neighborhood, double alpha, V queryObject) { - if (neighborhood.size() > 0) { - // TODO: store database link? - centerValues = new double[RelationUtil.dimensionality(relation)]; - variances = new double[centerValues.length]; - for (DBIDIter iter = neighborhood.iter(); iter.valid(); iter.advance()) { - V databaseObject = relation.get(iter); - for (int d = 0; d < centerValues.length; d++) { - centerValues[d] += databaseObject.doubleValue(d); - } - } - for (int d = 0; d < centerValues.length; d++) { - centerValues[d] /= neighborhood.size(); - } - for (DBIDIter iter = neighborhood.iter(); iter.valid(); iter.advance()) { - V databaseObject = relation.get(iter); - for (int d = 0; d < centerValues.length; d++) { - // distance - double distance = centerValues[d] - databaseObject.doubleValue(d); - // variance - variances[d] += distance * distance; - } - } - expectationOfVariance = 0; - for (int d = 0; d < variances.length; d++) { - variances[d] /= neighborhood.size(); - expectationOfVariance += variances[d]; - } - expectationOfVariance /= variances.length; - weightVector = new BitSet(variances.length); - for (int d = 0; d < variances.length; d++) { - if (variances[d] < alpha * expectationOfVariance) { - weightVector.set(d, true); - } - } - center = RelationUtil.getNumberVectorFactory(relation).newNumberVector(centerValues); - sod = subspaceOutlierDegree(queryObject, center, weightVector); - } else { - center = queryObject; - sod = 0.0; - } - } + private Vector center; /** - * Compute SOD score. - * - * @param queryObject Query object - * @param center Center vector - * @param weightVector Weight vector - * @return sod score + * Relevant dimensions. 
*/ - private double subspaceOutlierDegree(V queryObject, V center, BitSet weightVector) { - final SubspaceEuclideanDistanceFunction df = new SubspaceEuclideanDistanceFunction(weightVector); - final int card = weightVector.cardinality(); - if (card == 0) { - return 0; - } - double distance = df.distance(queryObject, center).doubleValue(); - distance /= card; - return distance; - } + private BitSet weightVector; /** - * Return the SOD of the point. + * Initialize SOD Model * - * @return sod value + * @param center Center vector + * @param weightVector Selected dimensions */ - public double getSod() { - return this.sod; + public SODModel(Vector center, BitSet weightVector) { + this.center = center; + this.weightVector = weightVector; } @Override public void writeToText(TextWriterStream out, String label) { - out.inlinePrint(label + "=" + this.sod); out.commentPrintLn(this.getClass().getSimpleName() + ":"); out.commentPrintLn("relevant attributes (counting starts with 0): " + this.weightVector.toString()); out.commentPrintLn("center of neighborhood: " + out.normalizationRestore(center).toString()); - out.commentPrintLn("subspace outlier degree: " + this.sod); out.commentPrintSeparator(); } - - @Override - public int compareTo(SODModel<?> o) { - return Double.compare(this.getSod(), o.getSod()); - } - } /** - * Proxy class that converts a model result to an actual SOD score result. + * Parameterization class. * * @author Erich Schubert * * @apiviz.exclude */ - protected static class SODProxyScoreResult implements Relation<Double> { + public static class Parameterizer<V extends NumberVector<?>, D extends NumberDistance<D, ?>> extends AbstractParameterizer { /** - * Model result this is a proxy for. + * Parameter to specify the number of shared nearest neighbors to be + * considered for learning the subspace properties., must be an integer + * greater than 0. 
*/ - Relation<SODModel<?>> models; + public static final OptionID KNN_ID = new OptionID("sod.knn", "The number of most snn-similar objects to use as reference set for learning the subspace properties."); /** - * The IDs we are defined for. + * Parameter to indicate the multiplier for the discriminance value for + * discerning small from large variances. */ - DBIDs dbids; + public static final OptionID ALPHA_ID = new OptionID("sod.alpha", "The multiplier for the discriminance value for discerning small from large variances."); /** - * Constructor. - * - * @param models Models result - * @param dbids IDs we are defined for + * Parameter for the similarity function. */ - public SODProxyScoreResult(Relation<SODModel<?>> models, DBIDs dbids) { - super(); - this.models = models; - this.dbids = dbids; - } - - @Override - public Double get(DBIDRef objID) { - return models.get(objID).getSod(); - } - - @Override - public String getLongName() { - return "Subspace Outlier Degree"; - } - - @Override - public String getShortName() { - return "sod-outlier"; - } - - @Override - public DBIDs getDBIDs() { - return dbids; - } - - @Override - public DBIDIter iterDBIDs() { - return dbids.iter(); - } - - @Override - public Database getDatabase() { - return null; // FIXME - } + public static final OptionID SIM_ID = new OptionID("sod.similarity", "The similarity function used for the neighborhood set."); - @Override - public void set(DBIDRef id, Double val) { - throw new UnsupportedOperationException(); - } - - @Override - public void delete(DBIDRef id) { - throw new UnsupportedOperationException(); - } - - @Override - public SimpleTypeInformation<Double> getDataTypeInformation() { - return TypeUtil.DOUBLE; - } - - @Override - public int size() { - return dbids.size(); - } - - @Override - public ResultHierarchy getHierarchy() { - return models.getHierarchy(); - } - - @Override - public void setHierarchy(ResultHierarchy hierarchy) { - models.setHierarchy(hierarchy); - } - } + /** + * 
Parameter for keeping the models. + */ + public static final OptionID MODELS_ID = new OptionID("sod.models", "Report the models computed by SOD (default: report only scores)."); - /** - * Parameterization class. - * - * @author Erich Schubert - * - * @apiviz.exclude - */ - public static class Parameterizer<V extends NumberVector<?>, D extends NumberDistance<D, ?>> extends AbstractParameterizer { /** - * Holds the value of {@link #KNN_ID}. + * Neighborhood size */ private int knn = 1; /** - * Holds the value of {@link #ALPHA_ID}. + * Alpha (discriminance value). */ private double alpha = 1.1; /** - * The similarity function - {@link #SIM_ID}. + * The similarity function. */ private SimilarityFunction<V, D> similarityFunction; + /** + * Track models. + */ + private boolean models = false; + @Override protected void makeOptions(Parameterization config) { super.makeOptions(config); @@ -460,21 +382,26 @@ public class SOD<V extends NumberVector<?>, D extends NumberDistance<D, ?>> exte } final IntParameter knnP = new IntParameter(KNN_ID); - knnP.addConstraint(new GreaterConstraint(0)); + knnP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT); if (config.grab(knnP)) { knn = knnP.getValue(); } final DoubleParameter alphaP = new DoubleParameter(ALPHA_ID, 1.1); - alphaP.addConstraint(new GreaterConstraint(0)); + alphaP.addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE); if (config.grab(alphaP)) { alpha = alphaP.doubleValue(); } + + final Flag modelsF = new Flag(MODELS_ID); + if (config.grab(modelsF)) { + models = modelsF.isTrue(); + } } @Override protected SOD<V, D> makeInstance() { - return new SOD<>(knn, alpha, similarityFunction); + return new SOD<>(knn, alpha, similarityFunction, models); } } } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/trivial/TrivialAverageCoordinateOutlier.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/trivial/TrivialAverageCoordinateOutlier.java index 84e3ad41..6f2f2f38 100644 --- 
a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/trivial/TrivialAverageCoordinateOutlier.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/trivial/TrivialAverageCoordinateOutlier.java @@ -80,13 +80,13 @@ public class TrivialAverageCoordinateOutlier extends AbstractAlgorithm<OutlierRe m.reset(); NumberVector<?> nv = relation.get(iditer); for (int i = 0; i < nv.getDimensionality(); i++) { - m.put(nv.doubleValue(i + 1)); + m.put(nv.doubleValue(i)); } final double score = m.getMean(); scores.putDouble(iditer, score); minmax.put(score); } - Relation<Double> scoreres = new MaterializedRelation<Double>("Trivial mean score", "mean-outlier", TypeUtil.DOUBLE, scores, relation.getDBIDs()); + Relation<Double> scoreres = new MaterializedRelation<>("Trivial mean score", "mean-outlier", TypeUtil.DOUBLE, scores, relation.getDBIDs()); OutlierScoreMeta meta = new BasicOutlierScoreMeta(minmax.getMin(), minmax.getMax()); return new OutlierResult(meta, scoreres); } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/trivial/TrivialGeneratedOutlier.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/trivial/TrivialGeneratedOutlier.java index 285b00df..2e952b5f 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/trivial/TrivialGeneratedOutlier.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/trivial/TrivialGeneratedOutlier.java @@ -52,8 +52,7 @@ import de.lmu.ifi.dbs.elki.result.outlier.ProbabilisticOutlierScore; import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException; import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.LessEqualConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import 
de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter; @@ -106,7 +105,8 @@ public class TrivialGeneratedOutlier extends AbstractAlgorithm<OutlierResult> im try { Relation<?> relation = database.getRelation(TypeUtil.CLASSLABEL); return run(models, vecs, relation); - } catch (NoSupportedDataTypeException e) { + } + catch(NoSupportedDataTypeException e) { // Otherwise, try any labellike. return run(models, vecs, database.getRelation(TypeUtil.GUESSED_LABEL)); } @@ -124,56 +124,58 @@ public class TrivialGeneratedOutlier extends AbstractAlgorithm<OutlierResult> im WritableDoubleDataStore scores = DataStoreUtil.makeDoubleStorage(models.getDBIDs(), DataStoreFactory.HINT_HOT); HashSet<GeneratorSingleCluster> generators = new HashSet<>(); - for (DBIDIter iditer = models.iterDBIDs(); iditer.valid(); iditer.advance()) { + for(DBIDIter iditer = models.iterDBIDs(); iditer.valid(); iditer.advance()) { Model model = models.get(iditer); - if (model instanceof GeneratorSingleCluster) { + if(model instanceof GeneratorSingleCluster) { generators.add((GeneratorSingleCluster) model); } } - if (generators.size() == 0) { + if(generators.size() == 0) { LOG.warning("No generator models found for dataset - all points will be considered outliers."); } - for (GeneratorSingleCluster gen : generators) { - for (int i = 0; i < gen.getDim(); i++) { + for(GeneratorSingleCluster gen : generators) { + for(int i = 0; i < gen.getDim(); i++) { Distribution dist = gen.getDistribution(i); - if (!(dist instanceof NormalDistribution)) { + if(!(dist instanceof NormalDistribution)) { throw new AbortException("TrivialGeneratedOutlier currently only supports normal distributions, got: " + dist); } } } - for (DBIDIter iditer = models.iterDBIDs(); iditer.valid(); iditer.advance()) { + for(DBIDIter iditer = models.iterDBIDs(); iditer.valid(); iditer.advance()) { double score = 1.; // Convert to a math vector Vector v = vecs.get(iditer).getColumnVector(); - for (GeneratorSingleCluster gen : 
generators) { + for(GeneratorSingleCluster gen : generators) { Vector tv = v; // Transform backwards - if (gen.getTransformation() != null) { + if(gen.getTransformation() != null) { tv = gen.getTransformation().applyInverse(v); } final int dim = tv.getDimensionality(); double lensq = 0.0; int norm = 0; - for (int i = 0; i < dim; i++) { + for(int i = 0; i < dim; i++) { Distribution dist = gen.getDistribution(i); - if (dist instanceof NormalDistribution) { + if(dist instanceof NormalDistribution) { NormalDistribution d = (NormalDistribution) dist; double delta = (tv.get(i) - d.getMean()) / d.getStddev(); lensq += delta * delta; norm += 1; - } else { + } + else { throw new AbortException("TrivialGeneratedOutlier currently only supports normal distributions, got: " + dist); } } - if (norm > 0.) { + if(norm > 0.) { // The squared distances are ChiSquared distributed score = Math.min(score, ChiSquaredDistribution.cdf(lensq, norm)); - } else { + } + else { score = 0.; } } - if (expect < 1) { + if(expect < 1) { score = expect * score / (1 - score + expect); } scores.putDouble(iditer, score); @@ -210,9 +212,9 @@ public class TrivialGeneratedOutlier extends AbstractAlgorithm<OutlierResult> im protected void makeOptions(Parameterization config) { super.makeOptions(config); DoubleParameter expectP = new DoubleParameter(EXPECT_ID, 0.01); - expectP.addConstraint(new GreaterConstraint(0.0)); - expectP.addConstraint(new LessEqualConstraint(1.0)); - if (config.grab(expectP)) { + expectP.addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE); + expectP.addConstraint(CommonConstraints.LESS_EQUAL_ONE_DOUBLE); + if(config.grab(expectP)) { expect = expectP.getValue(); } } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/statistics/AddSingleScale.java b/src/de/lmu/ifi/dbs/elki/algorithm/statistics/AddSingleScale.java index cbae17ca..8bd5f057 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/statistics/AddSingleScale.java +++ 
b/src/de/lmu/ifi/dbs/elki/algorithm/statistics/AddSingleScale.java @@ -94,7 +94,11 @@ public class AddSingleScale implements Algorithm { for(DBIDIter iditer = rel.iterDBIDs(); iditer.valid(); iditer.advance()) { NumberVector<?> vec = rel.get(iditer); for(int d = 0; d < dim; d++) { - mm.put(vec.doubleValue(d)); + final double val = vec.doubleValue(d); + if(val != val) { + continue; // NaN + } + mm.put(val); } } LinearScale scale = new LinearScale(mm.getMin(), mm.getMax()); diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/statistics/AveragePrecisionAtK.java b/src/de/lmu/ifi/dbs/elki/algorithm/statistics/AveragePrecisionAtK.java index 1b87a015..490f8ba6 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/statistics/AveragePrecisionAtK.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/statistics/AveragePrecisionAtK.java @@ -47,9 +47,7 @@ import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress; import de.lmu.ifi.dbs.elki.math.MeanVariance; import de.lmu.ifi.dbs.elki.result.CollectionResult; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.LessEqualConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.Flag; @@ -124,34 +122,36 @@ public class AveragePrecisionAtK<V extends Object, D extends NumberDistance<D, ? 
MeanVariance[] mvs = MeanVariance.newArray(k); final DBIDs ids; - if (sampling < 1.0) { + if(sampling < 1.0) { int size = Math.max(1, (int) (sampling * relation.size())); ids = DBIDUtil.randomSample(relation.getDBIDs(), size, seed); - } else { + } + else { ids = relation.getDBIDs(); } - if (LOG.isVerbose()) { + if(LOG.isVerbose()) { LOG.verbose("Processing points..."); } FiniteProgress objloop = LOG.isVerbose() ? new FiniteProgress("Computing nearest neighbors", ids.size(), LOG) : null; // sort neighbors - for (DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) { + for(DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) { KNNList<D> knn = knnQuery.getKNNForDBID(iter, qk); Object label = lrelation.get(iter); int positive = 0, i = 0; - for (DBIDIter ri = knn.iter(); i < k && ri.valid(); ri.advance()) { - if (!includeSelf && DBIDUtil.equal(iter, ri)) { + for(DBIDIter ri = knn.iter(); i < k && ri.valid(); ri.advance()) { + if(!includeSelf && DBIDUtil.equal(iter, ri)) { continue; } Object olabel = lrelation.get(ri); - if (label == null) { - if (olabel == null) { + if(label == null) { + if(olabel == null) { positive += 1; } - } else { - if (label.equals(olabel)) { + } + else { + if(label.equals(olabel)) { positive += 1; } } @@ -159,18 +159,18 @@ public class AveragePrecisionAtK<V extends Object, D extends NumberDistance<D, ? mvs[i].put(precision); i++; } - if (objloop != null) { + if(objloop != null) { objloop.incrementProcessed(LOG); } } - if (objloop != null) { + if(objloop != null) { objloop.ensureCompleted(LOG); } // Collections.sort(results); // Transform Histogram into a Double Vector array. Collection<DoubleVector> res = new ArrayList<>(k); - for (int i = 0; i < k; i++) { + for(int i = 0; i < k; i++) { DoubleVector row = new DoubleVector(new double[] { mvs[i].getMean(), mvs[i].getSampleStddev() }); res.add(row); } @@ -239,24 +239,24 @@ public class AveragePrecisionAtK<V extends Object, D extends NumberDistance<D, ? 
protected void makeOptions(Parameterization config) { super.makeOptions(config); final IntParameter kP = new IntParameter(K_ID); - kP.addConstraint(new GreaterEqualConstraint(2)); - if (config.grab(kP)) { + kP.addConstraint(CommonConstraints.GREATER_THAN_ONE_INT); + if(config.grab(kP)) { k = kP.getValue(); } final DoubleParameter samplingP = new DoubleParameter(SAMPLING_ID); - samplingP.addConstraint(new GreaterConstraint(0.0)); - samplingP.addConstraint(new LessEqualConstraint(1.0)); + samplingP.addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE); + samplingP.addConstraint(CommonConstraints.LESS_EQUAL_ONE_DOUBLE); samplingP.setOptional(true); - if (config.grab(samplingP)) { + if(config.grab(samplingP)) { sampling = samplingP.getValue(); } final LongParameter rndP = new LongParameter(SEED_ID); rndP.setOptional(true); - if (config.grab(rndP)) { + if(config.grab(rndP)) { seed = rndP.getValue(); } final Flag includeP = new Flag(INCLUDESELF_ID); - if (config.grab(includeP)) { + if(config.grab(includeP)) { includeSelf = includeP.isTrue(); } } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/statistics/DistanceStatisticsWithClasses.java b/src/de/lmu/ifi/dbs/elki/algorithm/statistics/DistanceStatisticsWithClasses.java index 3c8e1635..244af0ca 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/statistics/DistanceStatisticsWithClasses.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/statistics/DistanceStatisticsWithClasses.java @@ -62,7 +62,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Description; import de.lmu.ifi.dbs.elki.utilities.documentation.Title; import de.lmu.ifi.dbs.elki.utilities.exceptions.ExceptionMessages; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.OnlyOneIsAllowedToBeSetGlobalConstraint; import 
de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.Flag; @@ -159,24 +159,26 @@ public class DistanceStatisticsWithClasses<O, D extends NumberDistance<D, ?>> ex MeanVariance modif = new MeanVariance(); // Histogram final ObjHistogram<long[]> histogram; - if (stepprog != null) { + if(stepprog != null) { stepprog.beginStep(1, "Prepare histogram.", LOG); } - if (exact) { + if(exact) { gminmax = exactMinMax(relation, distFunc); histogram = new LongArrayStaticHistogram(numbin, gminmax.getMin(), gminmax.getMax(), 2); - } else if (sampling) { + } + else if(sampling) { gminmax = sampleMinMax(relation, distFunc); histogram = new LongArrayStaticHistogram(numbin, gminmax.getMin(), gminmax.getMax(), 2); - } else { + } + else { histogram = new AbstractObjDynamicHistogram<long[]>(numbin) { @Override protected long[] downsample(Object[] data, int start, int end, int size) { long[] ret = new long[2]; - for (int i = start; i < end; i++) { + for(int i = start; i < end; i++) { long[] existing = (long[]) data[i]; - if (existing != null) { - for (int c = 0; c < 2; c++) { + if(existing != null) { + for(int c = 0; c < 2; c++) { ret[c] += existing[c]; } } @@ -186,7 +188,7 @@ public class DistanceStatisticsWithClasses<O, D extends NumberDistance<D, ?>> ex @Override protected long[] aggregate(long[] first, long[] second) { - for (int c = 0; c < 2; c++) { + for(int c = 0; c < 2; c++) { first[c] += second[c]; } return first; @@ -204,20 +206,20 @@ public class DistanceStatisticsWithClasses<O, D extends NumberDistance<D, ?>> ex }; } - if (stepprog != null) { + if(stepprog != null) { stepprog.beginStep(2, "Build histogram.", LOG); } final FiniteProgress progress = LOG.isVerbose() ? 
new FiniteProgress("Distance computations", relation.size(), LOG) : null; // iterate per cluster final long[] incFirst = new long[] { 1L, 0L }; final long[] incSecond = new long[] { 0L, 1L }; - for (Cluster<?> c1 : split) { - for (DBIDIter id1 = c1.getIDs().iter(); id1.valid(); id1.advance()) { + for(Cluster<?> c1 : split) { + for(DBIDIter id1 = c1.getIDs().iter(); id1.valid(); id1.advance()) { // in-cluster distances DoubleMinMax iminmax = new DoubleMinMax(); - for (DBIDIter iter2 = c1.getIDs().iter(); iter2.valid(); iter2.advance()) { + for(DBIDIter iter2 = c1.getIDs().iter(); iter2.valid(); iter2.advance()) { // skip the point itself. - if (DBIDUtil.equal(id1, iter2)) { + if(DBIDUtil.equal(id1, iter2)) { continue; } double d = distFunc.distance(id1, iter2).doubleValue(); @@ -236,13 +238,13 @@ public class DistanceStatisticsWithClasses<O, D extends NumberDistance<D, ?>> ex // other-cluster distances DoubleMinMax ominmax = new DoubleMinMax(); - for (Cluster<?> c2 : split) { - if (c2 == c1) { + for(Cluster<?> c2 : split) { + if(c2 == c1) { continue; } - for (DBIDIter iter2 = c2.getIDs().iter(); iter2.valid(); iter2.advance()) { + for(DBIDIter iter2 = c2.getIDs().iter(); iter2.valid(); iter2.advance()) { // skip the point itself (shouldn't happen though) - if (DBIDUtil.equal(id1, iter2)) { + if(DBIDUtil.equal(id1, iter2)) { continue; } double d = distFunc.distance(id1, iter2).doubleValue(); @@ -259,33 +261,33 @@ public class DistanceStatisticsWithClasses<O, D extends NumberDistance<D, ?>> ex // min/max gominmax.put(ominmax.getMin()); gominmax.put(ominmax.getMax()); - if (progress != null) { + if(progress != null) { progress.incrementProcessed(LOG); } } } - if (progress != null) { + if(progress != null) { progress.ensureCompleted(LOG); } // Update values (only needed for sampling case). 
gminmax.setFirst(Math.min(giminmax.getMin(), gominmax.getMin())); gminmax.setSecond(Math.max(giminmax.getMax(), gominmax.getMax())); - if (stepprog != null) { + if(stepprog != null) { stepprog.setCompleted(LOG); } // count the number of samples we have in the data long inum = 0; long onum = 0; - for (ObjHistogram.Iter<long[]> iter = histogram.iter(); iter.valid(); iter.advance()) { + for(ObjHistogram.Iter<long[]> iter = histogram.iter(); iter.valid(); iter.advance()) { inum += iter.getValue()[0]; onum += iter.getValue()[1]; } long bnum = inum + onum; Collection<DoubleVector> binstat = new ArrayList<>(numbin); - for (ObjHistogram.Iter<long[]> iter = histogram.iter(); iter.valid(); iter.advance()) { + for(ObjHistogram.Iter<long[]> iter = histogram.iter(); iter.valid(); iter.advance()) { final long[] value = iter.getValue(); final double icof = (inum == 0) ? 0 : ((double) value[0]) / inum / histogram.getBinsize(); final double icaf = ((double) value[0]) / bnum / histogram.getBinsize(); @@ -327,26 +329,26 @@ public class DistanceStatisticsWithClasses<O, D extends NumberDistance<D, ?>> ex ArrayModifiableDBIDs randomset = DBIDUtil.newArray(randomsize); DBIDIter iter = relation.iterDBIDs(); - if (!iter.valid()) { + if(!iter.valid()) { throw new IllegalStateException(ExceptionMessages.DATABASE_EMPTY); } DBID firstid = DBIDUtil.deref(iter); iter.advance(); minhotset.add(DBIDUtil.newPair(Double.MAX_VALUE, firstid)); maxhotset.add(DBIDUtil.newPair(Double.MIN_VALUE, firstid)); - for (; iter.valid(); iter.advance()) { + for(; iter.valid(); iter.advance()) { // generate candidates for min distance. 
ArrayList<DoubleDBIDPair> np = new ArrayList<>(k * 2 + randomsize * 2); - for (DoubleDBIDPair pair : minhotset) { + for(DoubleDBIDPair pair : minhotset) { // skip the object itself - if (DBIDUtil.equal(iter, pair)) { + if(DBIDUtil.equal(iter, pair)) { continue; } double d = distFunc.distance(iter, pair).doubleValue(); np.add(DBIDUtil.newPair(d, iter)); np.add(DBIDUtil.newPair(d, pair)); } - for (DBIDIter iter2 = randomset.iter(); iter2.valid(); iter2.advance()) { + for(DBIDIter iter2 = randomset.iter(); iter2.valid(); iter2.advance()) { double d = distFunc.distance(iter, iter2).doubleValue(); np.add(DBIDUtil.newPair(d, iter)); np.add(DBIDUtil.newPair(d, iter2)); @@ -356,16 +358,16 @@ public class DistanceStatisticsWithClasses<O, D extends NumberDistance<D, ?>> ex // generate candidates for max distance. ArrayList<DoubleDBIDPair> np2 = new ArrayList<>(k * 2 + randomsize * 2); - for (DoubleDBIDPair pair : minhotset) { + for(DoubleDBIDPair pair : minhotset) { // skip the object itself - if (DBIDUtil.equal(iter, pair)) { + if(DBIDUtil.equal(iter, pair)) { continue; } double d = distFunc.distance(iter, pair).doubleValue(); np2.add(DBIDUtil.newPair(d, iter)); np2.add(DBIDUtil.newPair(d, pair)); } - for (DBIDIter iter2 = randomset.iter(); iter2.valid(); iter2.advance()) { + for(DBIDIter iter2 = randomset.iter(); iter2.valid(); iter2.advance()) { double d = distFunc.distance(iter, iter2).doubleValue(); np.add(DBIDUtil.newPair(d, iter)); np.add(DBIDUtil.newPair(d, iter2)); @@ -374,9 +376,10 @@ public class DistanceStatisticsWithClasses<O, D extends NumberDistance<D, ?>> ex shrinkHeap(maxhotset, k); // update random set - if (randomset.size() < randomsize) { + if(randomset.size() < randomsize) { randomset.add(iter); - } else if (rnd.nextDouble() < rprob) { + } + else if(rnd.nextDouble() < rprob) { randomset.set((int) Math.floor(rnd.nextDouble() * randomsize), iter); } } @@ -393,10 +396,10 @@ public class DistanceStatisticsWithClasses<O, D extends NumberDistance<D, ?>> ex 
private DoubleMinMax exactMinMax(Relation<O> relation, DistanceQuery<O, D> distFunc) { DoubleMinMax minmax = new DoubleMinMax(); // find exact minimum and maximum first. - for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { - for (DBIDIter iditer2 = relation.iterDBIDs(); iditer2.valid(); iditer2.advance()) { + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + for(DBIDIter iditer2 = relation.iterDBIDs(); iditer2.valid(); iditer2.advance()) { // skip the point itself. - if (DBIDUtil.equal(iditer, iditer2)) { + if(DBIDUtil.equal(iditer, iditer2)) { continue; } double d = distFunc.distance(iditer, iditer2).doubleValue(); @@ -416,11 +419,12 @@ public class DistanceStatisticsWithClasses<O, D extends NumberDistance<D, ?>> ex // drop duplicates ModifiableDBIDs seenids = DBIDUtil.newHashSet(2 * k); int cnt = 0; - for (Iterator<DoubleDBIDPair> i = hotset.iterator(); i.hasNext();) { + for(Iterator<DoubleDBIDPair> i = hotset.iterator(); i.hasNext();) { DoubleDBIDPair p = i.next(); - if (cnt > k || seenids.contains(p)) { + if(cnt > k || seenids.contains(p)) { i.remove(); - } else { + } + else { seenids.add(p); cnt++; } @@ -464,18 +468,18 @@ public class DistanceStatisticsWithClasses<O, D extends NumberDistance<D, ?>> ex protected void makeOptions(Parameterization config) { super.makeOptions(config); final IntParameter numbinP = new IntParameter(HISTOGRAM_BINS_ID, 20); - numbinP.addConstraint(new GreaterEqualConstraint(2)); - if (config.grab(numbinP)) { + numbinP.addConstraint(CommonConstraints.GREATER_THAN_ONE_INT); + if(config.grab(numbinP)) { numbin = numbinP.getValue(); } final Flag exactF = new Flag(EXACT_ID); - if (config.grab(exactF)) { + if(config.grab(exactF)) { exact = exactF.getValue(); } final Flag samplingF = new Flag(SAMPLING_ID); - if (config.grab(samplingF)) { + if(config.grab(samplingF)) { sampling = samplingF.getValue(); } diff --git 
a/src/de/lmu/ifi/dbs/elki/algorithm/statistics/EvaluateRankingQuality.java b/src/de/lmu/ifi/dbs/elki/algorithm/statistics/EvaluateRankingQuality.java index 76e5ef66..d5d8e407 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/statistics/EvaluateRankingQuality.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/statistics/EvaluateRankingQuality.java @@ -62,7 +62,7 @@ import de.lmu.ifi.dbs.elki.utilities.datastructures.histogram.ObjHistogram; import de.lmu.ifi.dbs.elki.utilities.documentation.Description; import de.lmu.ifi.dbs.elki.utilities.documentation.Title; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; @@ -121,7 +121,7 @@ public class EvaluateRankingQuality<V extends NumberVector<?>, D extends NumberD final DistanceQuery<V, D> distQuery = database.getDistanceQuery(relation, getDistanceFunction()); final KNNQuery<V, D> knnQuery = database.getKNNQuery(distQuery, relation.size()); - if (LOG.isVerbose()) { + if(LOG.isVerbose()) { LOG.verbose("Preprocessing clusters..."); } // Cluster by labels @@ -130,7 +130,7 @@ public class EvaluateRankingQuality<V extends NumberVector<?>, D extends NumberD // Compute cluster averages and covariance matrix HashMap<Cluster<?>, Vector> averages = new HashMap<>(split.size()); HashMap<Cluster<?>, Matrix> covmats = new HashMap<>(split.size()); - for (Cluster<?> clus : split) { + for(Cluster<?> clus : split) { CovarianceMatrix covmat = CovarianceMatrix.make(relation, clus.getIDs()); averages.put(clus, covmat.getMeanVector()); covmats.put(clus, covmat.destroyToNaiveMatrix()); @@ -138,42 +138,42 @@ public class EvaluateRankingQuality<V extends NumberVector<?>, D extends NumberD 
MeanVarianceStaticHistogram hist = new MeanVarianceStaticHistogram(numbins, 0.0, 1.0); - if (LOG.isVerbose()) { + if(LOG.isVerbose()) { LOG.verbose("Processing points..."); } FiniteProgress rocloop = LOG.isVerbose() ? new FiniteProgress("Computing ROC AUC values", relation.size(), LOG) : null; // sort neighbors - for (Cluster<?> clus : split) { + for(Cluster<?> clus : split) { ArrayList<DoubleDBIDPair> cmem = new ArrayList<>(clus.size()); Vector av = averages.get(clus); Matrix covm = covmats.get(clus); - for (DBIDIter iter = clus.getIDs().iter(); iter.valid(); iter.advance()) { + for(DBIDIter iter = clus.getIDs().iter(); iter.valid(); iter.advance()) { double d = MathUtil.mahalanobisDistance(covm, relation.get(iter).getColumnVector().minusEquals(av)); cmem.add(DBIDUtil.newPair(d, iter)); } Collections.sort(cmem); - for (int ind = 0; ind < cmem.size(); ind++) { + for(int ind = 0; ind < cmem.size(); ind++) { KNNList<D> knn = knnQuery.getKNNForDBID(cmem.get(ind), relation.size()); double result = ROC.computeROCAUCDistanceResult(relation.size(), clus, knn); hist.put(((double) ind) / clus.size(), result); - if (rocloop != null) { + if(rocloop != null) { rocloop.incrementProcessed(LOG); } } } - if (rocloop != null) { + if(rocloop != null) { rocloop.ensureCompleted(LOG); } // Collections.sort(results); // Transform Histogram into a Double Vector array. 
Collection<DoubleVector> res = new ArrayList<>(relation.size()); - for (ObjHistogram.Iter<MeanVariance> iter = hist.iter(); iter.valid(); iter.advance()) { + for(ObjHistogram.Iter<MeanVariance> iter = hist.iter(); iter.valid(); iter.advance()) { DoubleVector row = new DoubleVector(new double[] { iter.getCenter(), iter.getValue().getCount(), iter.getValue().getMean(), iter.getValue().getSampleVariance() }); res.add(row); } @@ -207,8 +207,8 @@ public class EvaluateRankingQuality<V extends NumberVector<?>, D extends NumberD protected void makeOptions(Parameterization config) { super.makeOptions(config); final IntParameter param = new IntParameter(HISTOGRAM_BINS_ID, 20); - param.addConstraint(new GreaterEqualConstraint(2)); - if (config.grab(param)) { + param.addConstraint(CommonConstraints.GREATER_THAN_ONE_INT); + if(config.grab(param)) { numbins = param.getValue(); } } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/statistics/RankingQualityHistogram.java b/src/de/lmu/ifi/dbs/elki/algorithm/statistics/RankingQualityHistogram.java index 58018029..7d0f1bb2 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/statistics/RankingQualityHistogram.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/statistics/RankingQualityHistogram.java @@ -51,7 +51,7 @@ import de.lmu.ifi.dbs.elki.utilities.datastructures.histogram.DoubleStaticHistog import de.lmu.ifi.dbs.elki.utilities.documentation.Description; import de.lmu.ifi.dbs.elki.utilities.documentation.Title; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; @@ -109,7 +109,7 @@ public class RankingQualityHistogram<O, D extends NumberDistance<D, ?>> extends final DistanceQuery<O, D> distanceQuery = 
database.getDistanceQuery(relation, getDistanceFunction()); final KNNQuery<O, D> knnQuery = database.getKNNQuery(distanceQuery, relation.size()); - if (LOG.isVerbose()) { + if(LOG.isVerbose()) { LOG.verbose("Preprocessing clusters..."); } // Cluster by labels @@ -117,33 +117,33 @@ public class RankingQualityHistogram<O, D extends NumberDistance<D, ?>> extends DoubleStaticHistogram hist = new DoubleStaticHistogram(numbins, 0.0, 1.0); - if (LOG.isVerbose()) { + if(LOG.isVerbose()) { LOG.verbose("Processing points..."); } FiniteProgress progress = LOG.isVerbose() ? new FiniteProgress("Computing ROC AUC values", relation.size(), LOG) : null; MeanVariance mv = new MeanVariance(); // sort neighbors - for (Cluster<?> clus : split) { - for (DBIDIter iter = clus.getIDs().iter(); iter.valid(); iter.advance()) { + for(Cluster<?> clus : split) { + for(DBIDIter iter = clus.getIDs().iter(); iter.valid(); iter.advance()) { KNNList<D> knn = knnQuery.getKNNForDBID(iter, relation.size()); double result = ROC.computeROCAUCDistanceResult(relation.size(), clus, knn); mv.put(result); hist.increment(result, 1. / relation.size()); - if (progress != null) { + if(progress != null) { progress.incrementProcessed(LOG); } } } - if (progress != null) { + if(progress != null) { progress.ensureCompleted(LOG); } // Transform Histogram into a Double Vector array. 
Collection<DoubleVector> res = new ArrayList<>(relation.size()); - for (DoubleStaticHistogram.Iter iter = hist.iter(); iter.valid(); iter.advance()) { + for(DoubleStaticHistogram.Iter iter = hist.iter(); iter.valid(); iter.advance()) { DoubleVector row = new DoubleVector(new double[] { iter.getCenter(), iter.getValue() }); res.add(row); } @@ -179,8 +179,8 @@ public class RankingQualityHistogram<O, D extends NumberDistance<D, ?>> extends protected void makeOptions(Parameterization config) { super.makeOptions(config); final IntParameter param = new IntParameter(HISTOGRAM_BINS_ID, 100); - param.addConstraint(new GreaterEqualConstraint(2)); - if (config.grab(param)) { + param.addConstraint(CommonConstraints.GREATER_THAN_ONE_INT); + if(config.grab(param)) { numbins = param.getValue(); } } |