diff options
Diffstat (limited to 'src/de/lmu/ifi/dbs/elki/algorithm')
124 files changed, 4111 insertions, 1470 deletions
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/APRIORI.java b/src/de/lmu/ifi/dbs/elki/algorithm/APRIORI.java index 5cfb7073..fc346cd9 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/APRIORI.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/APRIORI.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/AbstractAlgorithm.java b/src/de/lmu/ifi/dbs/elki/algorithm/AbstractAlgorithm.java index 61a695eb..30e6e226 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/AbstractAlgorithm.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/AbstractAlgorithm.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -126,6 +126,9 @@ public abstract class AbstractAlgorithm<R extends Result> implements Algorithm { if(e.getTargetException() instanceof RuntimeException) { throw (RuntimeException) e.getTargetException(); } + if(e.getTargetException() instanceof AssertionError) { + throw (AssertionError) e.getTargetException(); + } throw new APIViolationException("Invoking the real 'run' method failed: " + e.getTargetException().toString(), e.getTargetException()); } } @@ -147,6 +150,9 @@ public abstract class AbstractAlgorithm<R extends Result> implements Algorithm { if(e.getTargetException() instanceof RuntimeException) { throw (RuntimeException) e.getTargetException(); } + if(e.getTargetException() instanceof AssertionError) { + throw (AssertionError) e.getTargetException(); + } throw new APIViolationException("Invoking the real 'run' method failed: 
" + e.getTargetException().toString(), e.getTargetException()); } } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/AbstractDistanceBasedAlgorithm.java b/src/de/lmu/ifi/dbs/elki/algorithm/AbstractDistanceBasedAlgorithm.java index 394ea55b..70d4ba3a 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/AbstractDistanceBasedAlgorithm.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/AbstractDistanceBasedAlgorithm.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/AbstractPrimitiveDistanceBasedAlgorithm.java b/src/de/lmu/ifi/dbs/elki/algorithm/AbstractPrimitiveDistanceBasedAlgorithm.java index ad49563a..4fa12e11 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/AbstractPrimitiveDistanceBasedAlgorithm.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/AbstractPrimitiveDistanceBasedAlgorithm.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/Algorithm.java b/src/de/lmu/ifi/dbs/elki/algorithm/Algorithm.java index 40f24914..7c6f0dc5 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/Algorithm.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/Algorithm.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git 
a/src/de/lmu/ifi/dbs/elki/algorithm/DependencyDerivator.java b/src/de/lmu/ifi/dbs/elki/algorithm/DependencyDerivator.java index 86fc3e51..0ecfb228 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/DependencyDerivator.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/DependencyDerivator.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -24,7 +24,6 @@ package de.lmu.ifi.dbs.elki.algorithm; */ import java.text.NumberFormat; -import java.util.List; import java.util.Locale; import de.lmu.ifi.dbs.elki.data.NumberVector; @@ -34,9 +33,8 @@ import de.lmu.ifi.dbs.elki.data.type.TypeUtil; import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; -import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; -import de.lmu.ifi.dbs.elki.database.query.DistanceResultPair; import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; +import de.lmu.ifi.dbs.elki.database.query.knn.KNNResult; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance; @@ -97,7 +95,7 @@ public class DependencyDerivator<V extends NumberVector<V, ?>, D extends Distanc public static final OptionID OUTPUT_ACCURACY_ID = OptionID.getOrCreateOptionID("derivator.accuracy", "Threshold for output accuracy fraction digits."); /** - * Optional parameter to specify the treshold for the size of the random + * Optional parameter to specify the threshold for the size of the random * sample to use, must be an integer greater than 0. 
* <p/> * Default value: the size of the complete dataset @@ -159,17 +157,12 @@ public class DependencyDerivator<V extends NumberVector<V, ?>, D extends Distanc DBIDs ids; if(this.sampleSize > 0) { if(randomsample) { - ids = DBIDUtil.randomSample(relation.getDBIDs(), this.sampleSize, 1); + ids = DBIDUtil.randomSample(relation.getDBIDs(), this.sampleSize, 1l); } else { DistanceQuery<V, D> distanceQuery = database.getDistanceQuery(relation, getDistanceFunction()); - List<DistanceResultPair<D>> queryResults = database.getKNNQuery(distanceQuery, this.sampleSize).getKNNForObject(centroidDV, this.sampleSize); - ModifiableDBIDs tids = DBIDUtil.newHashSet(this.sampleSize); - for(DistanceResultPair<D> qr : queryResults) { - tids.add(qr.getDBID()); - } - // Cast to non-modifiable - ids = tids; + KNNResult<D> queryResults = database.getKNNQuery(distanceQuery, this.sampleSize).getKNNForObject(centroidDV, this.sampleSize); + ids = DBIDUtil.newHashSet(queryResults.asDBIDs()); } } else { @@ -243,7 +236,7 @@ public class DependencyDerivator<V extends NumberVector<V, ?>, D extends Distanc // +1 == + B.getColumnDimensionality() Matrix gaussJordan = new Matrix(transposedWeakEigenvectors.getRowDimensionality(), transposedWeakEigenvectors.getColumnDimensionality() + 1); gaussJordan.setMatrix(0, transposedWeakEigenvectors.getRowDimensionality() - 1, 0, transposedWeakEigenvectors.getColumnDimensionality() - 1, transposedWeakEigenvectors); - gaussJordan.setColumnVector(transposedWeakEigenvectors.getColumnDimensionality(), B); + gaussJordan.setCol(transposedWeakEigenvectors.getColumnDimensionality(), B); if(logger.isDebuggingFiner()) { logger.debugFiner("Gauss-Jordan-Elimination of " + FormatUtil.format(gaussJordan, NF)); diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/DummyAlgorithm.java b/src/de/lmu/ifi/dbs/elki/algorithm/DummyAlgorithm.java index e403c623..168c69f1 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/DummyAlgorithm.java +++ 
b/src/de/lmu/ifi/dbs/elki/algorithm/DummyAlgorithm.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -39,12 +39,12 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Description; import de.lmu.ifi.dbs.elki.utilities.documentation.Title; /** - * Dummy Algorithm, which just iterates over all points once, doing a 10NN query + * Dummy algorithm, which just iterates over all points once, doing a 10NN query * each. Useful in testing e.g. index structures and as template for custom - * algorithms. If you are looking for an algorithm that does <em>nothing</em>, + * algorithms. While this algorithm doesn't produce a result, it + * still performs rather expensive operations. If you are looking for an algorithm that does <em>nothing</em>, * you must use {@link de.lmu.ifi.dbs.elki.algorithm.NullAlgorithm - * NullAlgorithm} instead. While this algorithm doesn't produce a result, it - * still performs rather expensive operations. + * NullAlgorithm} instead. * * @author Erich Schubert * @param <O> Vector type @@ -52,7 +52,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Title; * @apiviz.uses KNNQuery */ @Title("Dummy Algorithm") -@Description("The algorithm executes a euclidena 10NN query on all data points, and can be used in unit testing") +@Description("The algorithm executes an Euclidean 10NN query on all data points, and can be used in unit testing") public class DummyAlgorithm<O extends NumberVector<?, ?>> extends AbstractAlgorithm<Result> { /** * The logger for this class. 
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/KNNDistanceOrder.java b/src/de/lmu/ifi/dbs/elki/algorithm/KNNDistanceOrder.java index 35065973..ac1820f9 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/KNNDistanceOrder.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/KNNDistanceOrder.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -32,9 +32,9 @@ import de.lmu.ifi.dbs.elki.data.type.TypeInformation; import de.lmu.ifi.dbs.elki.data.type.TypeUtil; import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.ids.DBID; -import de.lmu.ifi.dbs.elki.database.query.DistanceResultPair; import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; import de.lmu.ifi.dbs.elki.database.query.knn.KNNQuery; +import de.lmu.ifi.dbs.elki.database.query.knn.KNNResult; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction; import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance; @@ -117,9 +117,8 @@ public class KNNDistanceOrder<O, D extends Distance<D>> extends AbstractDistance List<D> knnDistances = new ArrayList<D>(relation.size()); for(DBID id : relation.iterDBIDs()) { if(random.nextDouble() < percentage) { - final List<DistanceResultPair<D>> neighbors = knnQuery.getKNNForDBID(id, k); - final int last = Math.min(k - 1, neighbors.size() - 1); - knnDistances.add(neighbors.get(last).getDistance()); + final KNNResult<D> neighbors = knnQuery.getKNNForDBID(id, k); + knnDistances.add(neighbors.getKNNDistance()); } } Collections.sort(knnDistances, Collections.reverseOrder()); diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/KNNJoin.java b/src/de/lmu/ifi/dbs/elki/algorithm/KNNJoin.java index 6dac350d..3cbfe143 100644 --- 
a/src/de/lmu/ifi/dbs/elki/algorithm/KNNJoin.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/KNNJoin.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -37,20 +37,24 @@ import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; import de.lmu.ifi.dbs.elki.database.datastore.WritableDataStore; import de.lmu.ifi.dbs.elki.database.ids.DBID; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; -import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; +import de.lmu.ifi.dbs.elki.database.query.DoubleDistanceResultPair; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.distance.DistanceUtil; import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction; import de.lmu.ifi.dbs.elki.distance.distancefunction.SpatialPrimitiveDistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancefunction.SpatialPrimitiveDoubleDistanceFunction; import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance; +import de.lmu.ifi.dbs.elki.distance.distancevalue.DoubleDistance; import de.lmu.ifi.dbs.elki.index.tree.LeafEntry; import de.lmu.ifi.dbs.elki.index.tree.spatial.SpatialEntry; import de.lmu.ifi.dbs.elki.index.tree.spatial.SpatialIndexTree; import de.lmu.ifi.dbs.elki.index.tree.spatial.SpatialNode; +import de.lmu.ifi.dbs.elki.index.tree.spatial.SpatialPointLeafEntry; import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress; import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress; import de.lmu.ifi.dbs.elki.result.ResultUtil; +import de.lmu.ifi.dbs.elki.utilities.datastructures.heap.Heap; import de.lmu.ifi.dbs.elki.utilities.datastructures.heap.KNNHeap; import de.lmu.ifi.dbs.elki.utilities.datastructures.heap.KNNList; import 
de.lmu.ifi.dbs.elki.utilities.documentation.Description; @@ -66,7 +70,13 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; * This algorithm only supports spatial databases based on a spatial index * structure. * + * Since this method compares the MBR of every single leaf with every other + * leaf, it is essentially quadratic in the number of leaves, which may not be + * appropriate for large trees. + * * @author Elke Achtert + * @author Erich Schubert + * * @param <V> the type of FeatureVector handled by this Algorithm * @param <D> the type of Distance used by this Algorithm * @param <N> the type of node used in the spatial index structure @@ -110,7 +120,7 @@ public class KNNJoin<V extends NumberVector<V, ?>, D extends Distance<D>, N exte * {@link SpatialPrimitiveDistanceFunction}. */ @SuppressWarnings("unchecked") - public DataStore<KNNList<D>> run(Database database, Relation<V> relation) throws IllegalStateException { + public WritableDataStore<KNNList<D>> run(Database database, Relation<V> relation) throws IllegalStateException { if(!(getDistanceFunction() instanceof SpatialPrimitiveDistanceFunction)) { throw new IllegalStateException("Distance Function must be an instance of " + SpatialPrimitiveDistanceFunction.class.getName()); } @@ -121,117 +131,222 @@ public class KNNJoin<V extends NumberVector<V, ?>, D extends Distance<D>, N exte // FIXME: Ensure were looking at the right relation! SpatialIndexTree<N, E> index = indexes.iterator().next(); SpatialPrimitiveDistanceFunction<V, D> distFunction = (SpatialPrimitiveDistanceFunction<V, D>) getDistanceFunction(); - DistanceQuery<V, D> distq = database.getDistanceQuery(relation, distFunction); - DBIDs ids = relation.getDBIDs(); - WritableDataStore<KNNHeap<D>> knnHeaps = DataStoreUtil.makeStorage(ids, DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, KNNHeap.class); + // Optimize for double? 
+ final boolean doubleOptimize = (getDistanceFunction() instanceof SpatialPrimitiveDoubleDistanceFunction); - try { - // data pages of s - List<E> ps_candidates = index.getLeaves(); - FiniteProgress progress = logger.isVerbose() ? new FiniteProgress(this.getClass().getName(), relation.size(), logger) : null; - IndefiniteProgress pageprog = logger.isVerbose() ? new IndefiniteProgress("Number of processed data pages", logger) : null; - if(logger.isDebugging()) { - logger.debugFine("# ps = " + ps_candidates.size()); - } - // data pages of r - List<E> pr_candidates = new ArrayList<E>(ps_candidates); - if(logger.isDebugging()) { - logger.debugFine("# pr = " + pr_candidates.size()); - } - int processed = 0; - int processedPages = 0; - boolean up = true; - for(E pr_entry : pr_candidates) { - N pr = index.getNode(pr_entry); - D pr_knn_distance = distq.infiniteDistance(); - if(logger.isDebugging()) { - logger.debugFine(" ------ PR = " + pr); - } - // create for each data object a knn list - for(int j = 0; j < pr.getNumEntries(); j++) { - knnHeaps.put(((LeafEntry) pr.getEntry(j)).getDBID(), new KNNHeap<D>(k, distq.infiniteDistance())); - } + // data pages + List<E> ps_candidates = new ArrayList<E>(index.getLeaves()); + // knn heaps + List<List<KNNHeap<D>>> heaps = new ArrayList<List<KNNHeap<D>>>(ps_candidates.size()); + Heap<Task> pq = new Heap<Task>(ps_candidates.size() * ps_candidates.size() / 10); - if(up) { - for(E ps_entry : ps_candidates) { - D distance = distFunction.minDist(pr_entry, ps_entry); + // Initialize with the page self-pairing + for(int i = 0; i < ps_candidates.size(); i++) { + E pr_entry = ps_candidates.get(i); + N pr = index.getNode(pr_entry); + heaps.add(initHeaps(distFunction, doubleOptimize, pr)); + } - if(distance.compareTo(pr_knn_distance) <= 0) { - N ps = index.getNode(ps_entry); - pr_knn_distance = processDataPages(distq, pr, ps, knnHeaps, pr_knn_distance); - } - } - up = false; + // Build priority queue + final int sqsize = ps_candidates.size() * 
(ps_candidates.size() - 1) / 2; + if(logger.isDebuggingFine()) { + logger.debugFine("Number of leaves: " + ps_candidates.size() + " so " + sqsize + " MBR computations."); + } + FiniteProgress mprogress = logger.isVerbose() ? new FiniteProgress("Comparing leaf MBRs", sqsize, logger) : null; + for(int i = 0; i < ps_candidates.size(); i++) { + E pr_entry = ps_candidates.get(i); + List<KNNHeap<D>> pr_heaps = heaps.get(i); + D pr_knn_distance = computeStopDistance(pr_heaps); + + for(int j = i + 1; j < ps_candidates.size(); j++) { + E ps_entry = ps_candidates.get(j); + List<KNNHeap<D>> ps_heaps = heaps.get(j); + D ps_knn_distance = computeStopDistance(ps_heaps); + D minDist = distFunction.minDist(pr_entry, ps_entry); + // Resolve immediately: + if(minDist.isNullDistance()) { + N pr = index.getNode(ps_candidates.get(i)); + N ps = index.getNode(ps_candidates.get(j)); + processDataPagesOptimize(distFunction, doubleOptimize, pr_heaps, ps_heaps, pr, ps); + } + else if(minDist.compareTo(pr_knn_distance) <= 0 || minDist.compareTo(ps_knn_distance) <= 0) { + pq.add(new Task(minDist, i, j)); } + if(mprogress != null) { + mprogress.incrementProcessed(logger); + } + } + } + if(mprogress != null) { + mprogress.ensureCompleted(logger); + } + // Process the queue + FiniteProgress qprogress = logger.isVerbose() ? new FiniteProgress("Processing queue", pq.size(), logger) : null; + IndefiniteProgress fprogress = logger.isVerbose() ? 
new IndefiniteProgress("Full comparisons", logger) : null; + while(!pq.isEmpty()) { + Task task = pq.poll(); + List<KNNHeap<D>> pr_heaps = heaps.get(task.i); + List<KNNHeap<D>> ps_heaps = heaps.get(task.j); + D pr_knn_distance = computeStopDistance(pr_heaps); + D ps_knn_distance = computeStopDistance(ps_heaps); + boolean dor = task.mindist.compareTo(pr_knn_distance) <= 0; + boolean dos = task.mindist.compareTo(ps_knn_distance) <= 0; + if(dor || dos) { + N pr = index.getNode(ps_candidates.get(task.i)); + N ps = index.getNode(ps_candidates.get(task.j)); + if(dor && dos) { + processDataPagesOptimize(distFunction, doubleOptimize, pr_heaps, ps_heaps, pr, ps); + } else { - for(int s = ps_candidates.size() - 1; s >= 0; s--) { - E ps_entry = ps_candidates.get(s); - D distance = distFunction.minDist(pr_entry, ps_entry); - - if(distance.compareTo(pr_knn_distance) <= 0) { - N ps = index.getNode(ps_entry); - pr_knn_distance = processDataPages(distq, pr, ps, knnHeaps, pr_knn_distance); - } + if(dor) { + processDataPagesOptimize(distFunction, doubleOptimize, pr_heaps, null, pr, ps); + } + else /* dos */{ + processDataPagesOptimize(distFunction, doubleOptimize, ps_heaps, null, ps, pr); } - up = true; } + if(fprogress != null) { + fprogress.incrementProcessed(logger); + } + } + if(qprogress != null) { + qprogress.incrementProcessed(logger); + } + } + if(qprogress != null) { + qprogress.ensureCompleted(logger); + } + if(fprogress != null) { + fprogress.setCompleted(logger); + } - processed += pr.getNumEntries(); + WritableDataStore<KNNList<D>> knnLists = DataStoreUtil.makeStorage(ids, DataStoreFactory.HINT_STATIC, KNNList.class); + // FiniteProgress progress = logger.isVerbose() ? new + // FiniteProgress(this.getClass().getName(), relation.size(), logger) : + // null; + FiniteProgress pageprog = logger.isVerbose() ? 
new FiniteProgress("Number of processed data pages", ps_candidates.size(), logger) : null; + // int processed = 0; + for(int i = 0; i < ps_candidates.size(); i++) { + N pr = index.getNode(ps_candidates.get(i)); + List<KNNHeap<D>> pr_heaps = heaps.get(i); - if(progress != null && pageprog != null) { - progress.setProcessed(processed, logger); - pageprog.setProcessed(processedPages++, logger); - } + // Finalize lists + for(int j = 0; j < pr.getNumEntries(); j++) { + knnLists.put(((LeafEntry) pr.getEntry(j)).getDBID(), pr_heaps.get(j).toKNNList()); } + // Forget heaps and pq + heaps.set(i, null); + // processed += pr.getNumEntries(); + + // if(progress != null) { + // progress.setProcessed(processed, logger); + // } if(pageprog != null) { - pageprog.setCompleted(logger); - } - WritableDataStore<KNNList<D>> knnLists = DataStoreUtil.makeStorage(ids, DataStoreFactory.HINT_STATIC, KNNList.class); - for(DBID id : ids) { - knnLists.put(id, knnHeaps.get(id).toKNNList()); + pageprog.incrementProcessed(logger); } - return knnLists; } + // if(progress != null) { + // progress.ensureCompleted(logger); + // } + if(pageprog != null) { + pageprog.ensureCompleted(logger); + } + return knnLists; + } - catch(Exception e) { - throw new IllegalStateException(e); + private List<KNNHeap<D>> initHeaps(SpatialPrimitiveDistanceFunction<V, D> distFunction, final boolean doubleOptimize, N pr) { + List<KNNHeap<D>> pr_heaps; + // Create for each data object a knn heap + pr_heaps = new ArrayList<KNNHeap<D>>(pr.getNumEntries()); + for(int j = 0; j < pr.getNumEntries(); j++) { + pr_heaps.add(new KNNHeap<D>(k, distFunction.getDistanceFactory().infiniteDistance())); } + // Self-join first, as this is expected to improve most and cannot be + // pruned. + processDataPagesOptimize(distFunction, doubleOptimize, pr_heaps, null, pr, pr); + return pr_heaps; } /** * Processes the two data pages pr and ps and determines the k-nearest * neighbors of pr in ps. 
* - * @param distQ the distance to use + * @param distFunction the distance to use + * @param doubleOptimize Flag whether to optimize for doubles. * @param pr the first data page * @param ps the second data page - * @param knnLists the knn lists for each data object - * @param pr_knn_distance the current knn distance of data page pr - * @return the k-nearest neighbor distance of pr in ps + * @param pr_heaps the knn lists for each data object in pr + * @param ps_heaps the knn lists for each data object in ps (if ps != pr) */ - private D processDataPages(DistanceQuery<V, D> distQ, N pr, N ps, WritableDataStore<KNNHeap<D>> knnLists, D pr_knn_distance) { - // TODO: optimize for double? - boolean infinite = pr_knn_distance.isInfiniteDistance(); - for(int i = 0; i < pr.getNumEntries(); i++) { - DBID r_id = ((LeafEntry) pr.getEntry(i)).getDBID(); - KNNHeap<D> knnList = knnLists.get(r_id); - + private void processDataPagesOptimize(SpatialPrimitiveDistanceFunction<V, D> distFunction, final boolean doubleOptimize, List<KNNHeap<D>> pr_heaps, List<KNNHeap<D>> ps_heaps, N pr, N ps) { + if(doubleOptimize) { + List<?> khp = (List<?>) pr_heaps; + List<?> khs = (List<?>) ps_heaps; + processDataPagesDouble((SpatialPrimitiveDoubleDistanceFunction<? 
super V>) distFunction, pr, ps, (List<KNNHeap<DoubleDistance>>) khp, (List<KNNHeap<DoubleDistance>>) khs); + } + else { for(int j = 0; j < ps.getNumEntries(); j++) { - DBID s_id = ((LeafEntry) ps.getEntry(j)).getDBID(); - - D distance = distQ.distance(r_id, s_id); - if(knnList.add(distance, s_id)) { - // set kNN distance of r - if(infinite) { - pr_knn_distance = knnList.getMaximumDistance(); + final SpatialPointLeafEntry s_e = (SpatialPointLeafEntry) ps.getEntry(j); + DBID s_id = s_e.getDBID(); + for(int i = 0; i < pr.getNumEntries(); i++) { + final SpatialPointLeafEntry r_e = (SpatialPointLeafEntry) pr.getEntry(i); + D distance = distFunction.minDist(s_e, r_e); + pr_heaps.get(i).add(distance, s_id); + if(pr != ps && ps_heaps != null) { + ps_heaps.get(j).add(distance, r_e.getDBID()); } - pr_knn_distance = DistanceUtil.max(knnList.getMaximumDistance(), pr_knn_distance); } } } + } + + /** + * Processes the two data pages pr and ps and determines the k-nearest + * neighbors of pr in ps. + * + * @param df the distance function to use + * @param pr the first data page + * @param ps the second data page + * @param pr_heaps the knn lists for each data object + * @param ps_heaps the knn lists for each data object in ps + */ + private void processDataPagesDouble(SpatialPrimitiveDoubleDistanceFunction<? 
super V> df, N pr, N ps, List<KNNHeap<DoubleDistance>> pr_heaps, List<KNNHeap<DoubleDistance>> ps_heaps) { + // Compare pairwise + for(int j = 0; j < ps.getNumEntries(); j++) { + final SpatialPointLeafEntry s_e = (SpatialPointLeafEntry) ps.getEntry(j); + DBID s_id = s_e.getDBID(); + for(int i = 0; i < pr.getNumEntries(); i++) { + final SpatialPointLeafEntry r_e = (SpatialPointLeafEntry) pr.getEntry(i); + double distance = df.doubleMinDist(s_e, r_e); + pr_heaps.get(i).add(new DoubleDistanceResultPair(distance, s_id)); + if(pr != ps && ps_heaps != null) { + ps_heaps.get(j).add(new DoubleDistanceResultPair(distance, r_e.getDBID())); + } + } + } + } + + /** + * Compute the maximum stop distance + * + * @param heaps + * @return the k-nearest neighbor distance of pr in ps + */ + private D computeStopDistance(List<KNNHeap<D>> heaps) { + // Update pruning distance + D pr_knn_distance = null; + for(KNNHeap<D> knnList : heaps) { + // set kNN distance of r + if(pr_knn_distance == null) { + pr_knn_distance = knnList.getKNNDistance(); + } + else { + pr_knn_distance = DistanceUtil.max(knnList.getKNNDistance(), pr_knn_distance); + } + } return pr_knn_distance; } @@ -246,6 +361,40 @@ public class KNNJoin<V extends NumberVector<V, ?>, D extends Distance<D>, N exte } /** + * Task in the processing queue + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + private class Task implements Comparable<Task> { + final D mindist; + + final int i; + + final int j; + + /** + * Constructor. + * + * @param mindist + * @param i + * @param j + */ + public Task(D mindist, int i, int j) { + super(); + this.mindist = mindist; + this.i = i; + this.j = j; + } + + @Override + public int compareTo(Task o) { + return mindist.compareTo(o.mindist); + } + } + + /** * Parameterization class. 
* * @author Erich Schubert diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/MaterializeDistances.java b/src/de/lmu/ifi/dbs/elki/algorithm/MaterializeDistances.java index 345fda1a..89d2d3e0 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/MaterializeDistances.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/MaterializeDistances.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/NullAlgorithm.java b/src/de/lmu/ifi/dbs/elki/algorithm/NullAlgorithm.java index 6c13e68e..a879c6b2 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/NullAlgorithm.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/NullAlgorithm.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/AbstractProjectedClustering.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/AbstractProjectedClustering.java index 5712d814..ea441655 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/AbstractProjectedClustering.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/AbstractProjectedClustering.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/AbstractProjectedDBSCAN.java 
b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/AbstractProjectedDBSCAN.java index 95d88b93..108ba0ed 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/AbstractProjectedDBSCAN.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/AbstractProjectedDBSCAN.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -340,9 +340,9 @@ public abstract class AbstractProjectedDBSCAN<R extends Clustering<Model>, V ext } } - if(processedIDs.size() == distFunc.getRelation().size() && noise.size() == 0) { + /* if(processedIDs.size() == relation.size() && noise.size() == 0) { break; - } + } */ } if(currentCluster.size() >= minpts) { diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/ClusteringAlgorithm.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/ClusteringAlgorithm.java index e28dbff3..5ec59777 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/ClusteringAlgorithm.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/ClusteringAlgorithm.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/DBSCAN.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/DBSCAN.java index 2576c5f6..b59af555 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/DBSCAN.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/DBSCAN.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering; This file is part of ELKI: Environment for Developing KDD-Applications Supported by 
Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -33,7 +33,6 @@ import de.lmu.ifi.dbs.elki.data.model.ClusterModel; import de.lmu.ifi.dbs.elki.data.model.Model; import de.lmu.ifi.dbs.elki.data.type.TypeInformation; import de.lmu.ifi.dbs.elki.data.type.TypeUtil; -import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.QueryUtil; import de.lmu.ifi.dbs.elki.database.ids.DBID; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; @@ -132,7 +131,7 @@ public class DBSCAN<O, D extends Distance<D>> extends AbstractDistanceBasedAlgor /** * Performs the DBSCAN algorithm on the given database. */ - public Clustering<Model> run(Database database, Relation<O> relation) { + public Clustering<Model> run(Relation<O> relation) { RangeQuery<O, D> rangeQuery = QueryUtil.getRangeQuery(relation, getDistanceFunction()); final int size = relation.size(); @@ -142,9 +141,9 @@ public class DBSCAN<O, D extends Distance<D>> extends AbstractDistanceBasedAlgor noise = DBIDUtil.newHashSet(); processedIDs = DBIDUtil.newHashSet(size); if(size >= minpts) { - for(DBID id : rangeQuery.getRelation().iterDBIDs()) { + for(DBID id : relation.iterDBIDs()) { if(!processedIDs.contains(id)) { - expandCluster(database, rangeQuery, id, objprog, clusprog); + expandCluster(relation, rangeQuery, id, objprog, clusprog); } if(objprog != null && clusprog != null) { objprog.setProcessed(processedIDs.size(), logger); @@ -156,7 +155,7 @@ public class DBSCAN<O, D extends Distance<D>> extends AbstractDistanceBasedAlgor } } else { - for(DBID id : rangeQuery.getRelation().iterDBIDs()) { + for(DBID id : relation.iterDBIDs()) { noise.add(id); if(objprog != null && clusprog != null) { objprog.setProcessed(noise.size(), logger); @@ -189,12 +188,12 @@ public class DBSCAN<O, D extends Distance<D>> extends AbstractDistanceBasedAlgor * <p/> * Border-Objects become members of the first 
possible cluster. * - * @param database the database on which the algorithm is run + * @param relation Database relation to run on * @param rangeQuery Range query to use * @param startObjectID potential seed of a new potential cluster * @param objprog the progress object for logging the current status */ - protected void expandCluster(Database database, RangeQuery<O, D> rangeQuery, DBID startObjectID, FiniteProgress objprog, IndefiniteProgress clusprog) { + protected void expandCluster(Relation<O> relation, RangeQuery<O, D> rangeQuery, DBID startObjectID, FiniteProgress objprog, IndefiniteProgress clusprog) { List<DistanceResultPair<D>> seeds = rangeQuery.getRangeForDBID(startObjectID, epsilon); // startObject is no core-object @@ -245,7 +244,7 @@ public class DBSCAN<O, D extends Distance<D>> extends AbstractDistanceBasedAlgor } } - if(processedIDs.size() == rangeQuery.getRelation().size() && noise.size() == 0) { + if(processedIDs.size() == relation.size() && noise.size() == 0) { break; } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/DeLiClu.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/DeLiClu.java index ca401ddc..f1e6c945 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/DeLiClu.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/DeLiClu.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/EM.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/EM.java index c1285659..a70a3f6f 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/EM.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/EM.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering; This file is part of ELKI: Environment for 
Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -25,9 +25,10 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering; import java.util.ArrayList; import java.util.List; -import java.util.Random; import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm; +import de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.KMeansInitialization; +import de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.RandomlyGeneratedInitialMeans; import de.lmu.ifi.dbs.elki.data.Cluster; import de.lmu.ifi.dbs.elki.data.Clustering; import de.lmu.ifi.dbs.elki.data.NumberVector; @@ -42,6 +43,7 @@ import de.lmu.ifi.dbs.elki.database.ids.DBID; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.distance.distancefunction.EuclideanDistanceFunction; import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.math.MathUtil; import de.lmu.ifi.dbs.elki.math.linearalgebra.Matrix; @@ -58,8 +60,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualCons import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.LongParameter; -import de.lmu.ifi.dbs.elki.utilities.pairs.Pair; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; /** * Provides the EM algorithm (clustering by expectation maximization). 
@@ -113,6 +114,11 @@ public class EM<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clusteri */ public static final OptionID DELTA_ID = OptionID.getOrCreateOptionID("em.delta", "The termination criterion for maximization of E(M): " + "E(M) - E(M') < em.delta"); + /** + * Parameter to specify the initialization method + */ + public static final OptionID INIT_ID = OptionID.getOrCreateOptionID("kmeans.initialization", "Method to choose the initial means."); + private static final double MIN_LOGLIKELIHOOD = -100000; /** @@ -121,32 +127,27 @@ public class EM<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clusteri private double delta; /** - * Parameter to specify the random generator seed. + * Store the individual probabilities, for use by EMOutlierDetection etc. */ - public static final OptionID SEED_ID = OptionID.getOrCreateOptionID("em.seed", "The random number generator seed."); + private WritableDataStore<double[]> probClusterIGivenX; /** - * Holds the value of {@link #SEED_ID}. + * Class to choose the initial means */ - private Long seed; - - /** - * Store the individual probabilities, for use by EMOutlierDetection etc. - */ - private WritableDataStore<double[]> probClusterIGivenX; + private KMeansInitialization<V> initializer; /** * Constructor. 
* * @param k k parameter * @param delta delta parameter - * @param seed Seed parameter + * @param initializer Class to choose the initial means */ - public EM(int k, double delta, Long seed) { + public EM(int k, double delta, KMeansInitialization<V> initializer) { super(); this.k = k; this.delta = delta; - this.seed = seed; + this.initializer = initializer; } /** @@ -169,14 +170,14 @@ public class EM<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clusteri if(logger.isVerbose()) { logger.verbose("initializing " + k + " models"); } - List<V> means = initialMeans(relation); + List<Vector> means = initializer.chooseInitialMeans(relation, k, EuclideanDistanceFunction.STATIC); List<Matrix> covarianceMatrices = new ArrayList<Matrix>(k); List<Double> normDistrFactor = new ArrayList<Double>(k); List<Matrix> invCovMatr = new ArrayList<Matrix>(k); List<Double> clusterWeights = new ArrayList<Double>(k); probClusterIGivenX = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_SORTED, double[].class); - int dimensionality = means.get(0).getDimensionality(); + final int dimensionality = means.get(0).getDimensionality(); for(int i = 0; i < k; i++) { Matrix m = Matrix.identity(dimensionality, dimensionality); covarianceMatrices.add(m); @@ -211,12 +212,12 @@ public class EM<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clusteri em = emNew; // recompute models - List<V> meanSums = new ArrayList<V>(k); + List<Vector> meanSums = new ArrayList<Vector>(k); double[] sumOfClusterProbabilities = new double[k]; for(int i = 0; i < k; i++) { clusterWeights.set(i, 0.0); - meanSums.add(means.get(i).nullVector()); + meanSums.add(new Vector(dimensionality)); covarianceMatrices.set(i, Matrix.zeroMatrix(dimensionality)); } @@ -226,24 +227,23 @@ public class EM<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clusteri for(int i = 0; i < k; i++) { sumOfClusterProbabilities[i] += clusterProbabilities[i]; - V summand = 
relation.get(id).multiplicate(clusterProbabilities[i]); - V currentMeanSum = meanSums.get(i).plus(summand); - meanSums.set(i, currentMeanSum); + Vector summand = relation.get(id).getColumnVector().timesEquals(clusterProbabilities[i]); + meanSums.get(i).plusEquals(summand); } } final int n = relation.size(); for(int i = 0; i < k; i++) { clusterWeights.set(i, sumOfClusterProbabilities[i] / n); - V newMean = meanSums.get(i).multiplicate(1 / sumOfClusterProbabilities[i]); + Vector newMean = meanSums.get(i).timesEquals(1 / sumOfClusterProbabilities[i]); means.set(i, newMean); } // covariance matrices for(DBID id : relation.iterDBIDs()) { double[] clusterProbabilities = probClusterIGivenX.get(id); - V instance = relation.get(id); + Vector instance = relation.get(id).getColumnVector(); for(int i = 0; i < k; i++) { - V difference = instance.minus(means.get(i)); - covarianceMatrices.get(i).plusEquals(difference.getColumnVector().times(difference.getRowVector()).times(clusterProbabilities[i])); + Vector difference = instance.minus(means.get(i)); + covarianceMatrices.get(i).plusEquals(difference.timesTranspose(difference).timesEquals(clusterProbabilities[i])); } } for(int i = 0; i < k; i++) { @@ -281,13 +281,14 @@ public class EM<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clusteri } hardClusters.get(maxIndex).add(id); } + final V factory = DatabaseUtil.assumeVectorField(relation).getFactory(); Clustering<EMModel<V>> result = new Clustering<EMModel<V>>("EM Clustering", "em-clustering"); // provide models within the result for(int i = 0; i < k; i++) { // TODO: re-do labeling. 
// SimpleClassLabel label = new SimpleClassLabel(); // label.init(result.canonicalClusterLabel(i)); - Cluster<EMModel<V>> model = new Cluster<EMModel<V>>(hardClusters.get(i), new EMModel<V>(means.get(i), covarianceMatrices.get(i))); + Cluster<EMModel<V>> model = new Cluster<EMModel<V>>(hardClusters.get(i), new EMModel<V>(factory.newNumberVector(means.get(i).getArrayRef()), covarianceMatrices.get(i))); result.addCluster(model); } return result; @@ -308,24 +309,20 @@ public class EM<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clusteri * @param clusterWeights the weights of the current clusters * @return the expectation value of the current mixture of distributions */ - protected double assignProbabilitiesToInstances(Relation<V> database, List<Double> normDistrFactor, List<V> means, List<Matrix> invCovMatr, List<Double> clusterWeights, WritableDataStore<double[]> probClusterIGivenX) { + protected double assignProbabilitiesToInstances(Relation<V> database, List<Double> normDistrFactor, List<Vector> means, List<Matrix> invCovMatr, List<Double> clusterWeights, WritableDataStore<double[]> probClusterIGivenX) { double emSum = 0.0; for(DBID id : database.iterDBIDs()) { - V x = database.get(id); + Vector x = database.get(id).getColumnVector(); List<Double> probabilities = new ArrayList<Double>(k); for(int i = 0; i < k; i++) { - V difference = x.minus(means.get(i)); - Matrix differenceRow = difference.getRowVector(); - Vector differenceCol = difference.getColumnVector(); - Matrix rowTimesCov = differenceRow.times(invCovMatr.get(i)); - Vector rowTimesCovTimesCol = rowTimesCov.times(differenceCol); - double power = rowTimesCovTimesCol.get(0, 0) / 2.0; + Vector difference = x.minus(means.get(i)); + double rowTimesCovTimesCol = difference.transposeTimesTimes(invCovMatr.get(i), difference); + double power = rowTimesCovTimesCol / 2.0; double prob = normDistrFactor.get(i) * Math.exp(-power); if(logger.isDebuggingFinest()) { - logger.debugFinest(" difference vector= ( " + 
difference.toString() + " )\n" + " differenceRow:\n" + FormatUtil.format(differenceRow, " ") + "\n" + " differenceCol:\n" + FormatUtil.format(differenceCol, " ") + "\n" + " rowTimesCov:\n" + FormatUtil.format(rowTimesCov, " ") + "\n" + " rowTimesCovTimesCol:\n" + FormatUtil.format(rowTimesCovTimesCol, " ") + "\n" + " power= " + power + "\n" + " prob=" + prob + "\n" + " inv cov matrix: \n" + FormatUtil.format(invCovMatr.get(i), " ")); + logger.debugFinest(" difference vector= ( " + difference.toString() + " )\n" + " difference:\n" + FormatUtil.format(difference, " ") + "\n" + " rowTimesCovTimesCol:\n" + rowTimesCovTimesCol + "\n" + " power= " + power + "\n" + " prob=" + prob + "\n" + " inv cov matrix: \n" + FormatUtil.format(invCovMatr.get(i), " ")); } - probabilities.add(prob); } double priorProbability = 0.0; @@ -356,48 +353,6 @@ public class EM<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clusteri } /** - * Creates {@link #k k} random points distributed uniformly within the - * attribute ranges of the given database. - * - * @param relation the database must contain enough points in order to - * ascertain the range of attribute values. Less than two points would - * make no sense. The content of the database is not touched otherwise. 
- * @return a list of {@link #k k} random points distributed uniformly within - * the attribute ranges of the given database - */ - protected List<V> initialMeans(Relation<V> relation) { - final Random random; - if(this.seed != null) { - random = new Random(this.seed); - } - else { - random = new Random(); - } - if(relation.size() > 0) { - final int dim = DatabaseUtil.dimensionality(relation); - Pair<V, V> minmax = DatabaseUtil.computeMinMax(relation); - List<V> means = new ArrayList<V>(k); - if(logger.isVerbose()) { - logger.verbose("initializing random vectors"); - } - for(int i = 0; i < k; i++) { - double[] r = MathUtil.randomDoubleArray(dim, random); - // Rescale - for (int d = 0; d < dim; d++) { - r[d] = minmax.first.doubleValue(d + 1) + (minmax.second.doubleValue(d + 1) - minmax.first.doubleValue(d + 1)) * r[d]; - } - // Instantiate - V randomVector = minmax.first.newInstance(r); - means.add(randomVector); - } - return means; - } - else { - return new ArrayList<V>(0); - } - } - - /** * Get the probabilities for a given point. 
* * @param index Point ID @@ -429,7 +384,7 @@ public class EM<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clusteri protected double delta; - protected Long seed; + protected KMeansInitialization<V> initializer; @Override protected void makeOptions(Parameterization config) { @@ -439,20 +394,20 @@ public class EM<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clusteri k = kP.getValue(); } + ObjectParameter<KMeansInitialization<V>> initialP = new ObjectParameter<KMeansInitialization<V>>(INIT_ID, KMeansInitialization.class, RandomlyGeneratedInitialMeans.class); + if(config.grab(initialP)) { + initializer = initialP.instantiateClass(config); + } + DoubleParameter deltaP = new DoubleParameter(DELTA_ID, new GreaterEqualConstraint(0.0), 0.0); if(config.grab(deltaP)) { delta = deltaP.getValue(); } - - LongParameter seedP = new LongParameter(SEED_ID, true); - if(config.grab(seedP)) { - seed = seedP.getValue(); - } } @Override protected EM<V> makeInstance() { - return new EM<V>(k, delta, seed); + return new EM<V>(k, delta, initializer); } } }
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/KMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/KMeans.java deleted file mode 100644 index 38ea89c2..00000000 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/KMeans.java +++ /dev/null @@ -1,307 +0,0 @@ -package de.lmu.ifi.dbs.elki.algorithm.clustering; - -/* - This file is part of ELKI: - Environment for Developing KDD-Applications Supported by Index-Structures - - Copyright (C) 2011 - Ludwig-Maximilians-Universität München - Lehr- und Forschungseinheit für Datenbanksysteme - ELKI Development Team - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see <http://www.gnu.org/licenses/>. 
- */ - -import java.util.ArrayList; -import java.util.Collections; -import java.util.Iterator; -import java.util.List; -import java.util.Random; - -import de.lmu.ifi.dbs.elki.algorithm.AbstractPrimitiveDistanceBasedAlgorithm; -import de.lmu.ifi.dbs.elki.data.Cluster; -import de.lmu.ifi.dbs.elki.data.Clustering; -import de.lmu.ifi.dbs.elki.data.NumberVector; -import de.lmu.ifi.dbs.elki.data.model.MeanModel; -import de.lmu.ifi.dbs.elki.data.type.TypeInformation; -import de.lmu.ifi.dbs.elki.data.type.TypeUtil; -import de.lmu.ifi.dbs.elki.database.Database; -import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs; -import de.lmu.ifi.dbs.elki.database.ids.DBID; -import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; -import de.lmu.ifi.dbs.elki.database.ids.DBIDs; -import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; -import de.lmu.ifi.dbs.elki.database.relation.Relation; -import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; -import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance; -import de.lmu.ifi.dbs.elki.logging.Logging; -import de.lmu.ifi.dbs.elki.math.MathUtil; -import de.lmu.ifi.dbs.elki.utilities.DatabaseUtil; -import de.lmu.ifi.dbs.elki.utilities.documentation.Description; -import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; -import de.lmu.ifi.dbs.elki.utilities.documentation.Title; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.LongParameter; -import de.lmu.ifi.dbs.elki.utilities.pairs.Pair; - -/** - * Provides the k-means algorithm. - * <p> - * Reference: J. 
MacQueen: Some Methods for Classification and Analysis of - * Multivariate Observations. <br> - * In 5th Berkeley Symp. Math. Statist. Prob., Vol. 1, 1967, pp 281-297. - * </p> - * - * @author Arthur Zimek - * - * @apiviz.has MeanModel - * - * @param <D> a type of {@link Distance} as returned by the used distance - * function - * @param <V> a type of {@link NumberVector} as a suitable datatype for this - * algorithm - */ -@Title("K-Means") -@Description("Finds a partitioning into k clusters.") -@Reference(authors = "J. MacQueen", title = "Some Methods for Classification and Analysis of Multivariate Observations", booktitle = "5th Berkeley Symp. Math. Statist. Prob., Vol. 1, 1967, pp 281-297", url = "http://projecteuclid.org/euclid.bsmsp/1200512992") -public class KMeans<V extends NumberVector<V, ?>, D extends Distance<D>> extends AbstractPrimitiveDistanceBasedAlgorithm<V, D, Clustering<MeanModel<V>>> implements ClusteringAlgorithm<Clustering<MeanModel<V>>> { - /** - * The logger for this class. - */ - private static final Logging logger = Logging.getLogger(KMeans.class); - - /** - * Parameter to specify the number of clusters to find, must be an integer - * greater than 0. - */ - public static final OptionID K_ID = OptionID.getOrCreateOptionID("kmeans.k", "The number of clusters to find."); - - /** - * Parameter to specify the number of clusters to find, must be an integer - * greater or equal to 0, where 0 means no limit. - */ - public static final OptionID MAXITER_ID = OptionID.getOrCreateOptionID("kmeans.maxiter", "The maximum number of iterations to do. 0 means no limit."); - - /** - * Parameter to specify the random generator seed. - */ - public static final OptionID SEED_ID = OptionID.getOrCreateOptionID("kmeans.seed", "The random number generator seed."); - - /** - * Holds the value of {@link #K_ID}. - */ - private int k; - - /** - * Holds the value of {@link #MAXITER_ID}. - */ - private int maxiter; - - /** - * Holds the value of {@link #SEED_ID}. 
- */ - private Long seed; - - /** - * Constructor. - * - * @param distanceFunction distance function - * @param k k parameter - * @param maxiter Maxiter parameter - * @param seed Random generator seed - */ - public KMeans(PrimitiveDistanceFunction<? super V, D> distanceFunction, int k, int maxiter, Long seed) { - super(distanceFunction); - this.k = k; - this.maxiter = maxiter; - this.seed = seed; - } - - /** - * Run k-means - * - * @param database Database - * @param relation relation to use - * @return result - * @throws IllegalStateException - */ - public Clustering<MeanModel<V>> run(Database database, Relation<V> relation) throws IllegalStateException { - final Random random = (this.seed != null) ? new Random(this.seed) : new Random(); - if(relation.size() > 0) { - final int dim = DatabaseUtil.dimensionality(relation); - Pair<V, V> minmax = DatabaseUtil.computeMinMax(relation); - List<V> means = new ArrayList<V>(k); - List<V> oldMeans; - if(logger.isVerbose()) { - logger.verbose("initializing random vectors"); - } - for(int i = 0; i < k; i++) { - double[] r = MathUtil.randomDoubleArray(dim, random); - // Rescale - for (int d = 0; d < dim; d++) { - r[d] = minmax.first.doubleValue(d + 1) + (minmax.second.doubleValue(d + 1) - minmax.first.doubleValue(d + 1)) * r[d]; - } - // Instantiate - V randomVector = minmax.first.newInstance(r); - means.add(randomVector); - } - List<? 
extends ModifiableDBIDs> clusters; - clusters = sort(means, relation); - boolean changed = true; - int iteration = 1; - while(changed) { - if(logger.isVerbose()) { - logger.verbose("iteration " + iteration); - } - oldMeans = new ArrayList<V>(means); - means = means(clusters, means, relation); - clusters = sort(means, relation); - changed = !means.equals(oldMeans); - iteration++; - - if(maxiter > 0 && iteration > maxiter) { - break; - } - } - Clustering<MeanModel<V>> result = new Clustering<MeanModel<V>>("k-Means Clustering", "kmeans-clustering"); - for(int i = 0; i < clusters.size(); i++) { - DBIDs ids = clusters.get(i); - MeanModel<V> model = new MeanModel<V>(means.get(i)); - result.addCluster(new Cluster<MeanModel<V>>(ids, model)); - } - return result; - } - else { - return new Clustering<MeanModel<V>>("k-Means Clustering", "kmeans-clustering"); - } - } - - /** - * Returns the mean vectors of the given clusters in the given database. - * - * @param clusters the clusters to compute the means - * @param means the recent means - * @param database the database containing the vectors - * @return the mean vectors of the given clusters in the given database - */ - protected List<V> means(List<? extends ModifiableDBIDs> clusters, List<V> means, Relation<V> database) { - List<V> newMeans = new ArrayList<V>(k); - for(int i = 0; i < k; i++) { - ModifiableDBIDs list = clusters.get(i); - V mean = null; - for(Iterator<DBID> clusterIter = list.iterator(); clusterIter.hasNext();) { - if(mean == null) { - mean = database.get(clusterIter.next()); - } - else { - mean = mean.plus(database.get(clusterIter.next())); - } - } - if(list.size() > 0) { - assert mean != null; - mean = mean.multiplicate(1.0 / list.size()); - } - else { - mean = means.get(i); - } - newMeans.add(mean); - } - return newMeans; - } - - /** - * Returns a list of clusters. The k<sup>th</sup> cluster contains the ids of - * those FeatureVectors, that are nearest to the k<sup>th</sup> mean. 
- * - * @param means a list of k means - * @param database the database to cluster - * @return list of k clusters - */ - protected List<? extends ModifiableDBIDs> sort(List<V> means, Relation<V> database) { - List<ArrayModifiableDBIDs> clusters = new ArrayList<ArrayModifiableDBIDs>(k); - for(int i = 0; i < k; i++) { - clusters.add(DBIDUtil.newArray()); - } - - for(DBID id : database.iterDBIDs()) { - List<D> distances = new ArrayList<D>(k); - V fv = database.get(id); - int minIndex = 0; - for(int d = 0; d < k; d++) { - distances.add(getDistanceFunction().distance(fv, means.get(d))); - if(distances.get(d).compareTo(distances.get(minIndex)) < 0) { - minIndex = d; - } - } - clusters.get(minIndex).add(id); - } - for(ArrayModifiableDBIDs cluster : clusters) { - Collections.sort(cluster); - } - return clusters; - } - - @Override - public TypeInformation[] getInputTypeRestriction() { - return TypeUtil.array(TypeUtil.NUMBER_VECTOR_FIELD); - } - - @Override - protected Logging getLogger() { - return logger; - } - - /** - * Parameterization class. - * - * @author Erich Schubert - * - * @apiviz.exclude - */ - public static class Parameterizer<V extends NumberVector<V, ?>, D extends Distance<D>> extends AbstractPrimitiveDistanceBasedAlgorithm.Parameterizer<V, D> { - protected int k; - - protected int maxiter; - - protected Long seed; - - @Override - protected void makeOptions(Parameterization config) { - super.makeOptions(config); - IntParameter kP = new IntParameter(K_ID, new GreaterConstraint(0)); - if(config.grab(kP)) { - k = kP.getValue(); - } - - IntParameter maxiterP = new IntParameter(MAXITER_ID, new GreaterEqualConstraint(0), 0); - if(config.grab(maxiterP)) { - maxiter = maxiterP.getValue(); - } - - LongParameter seedP = new LongParameter(SEED_ID, true); - if(config.grab(seedP)) { - seed = seedP.getValue(); - } - } - - @Override - protected KMeans<V, D> makeInstance() { - return new KMeans<V, D>(distanceFunction, k, maxiter, seed); - } - } -}
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICS.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICS.java index 24985e24..2244b07b 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICS.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICS.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICSTypeAlgorithm.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICSTypeAlgorithm.java index c233963d..d6c5872a 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICSTypeAlgorithm.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICSTypeAlgorithm.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICSXi.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICSXi.java index f7bd10c7..41e48b89 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICSXi.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICSXi.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/SLINK.java 
b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/SLINK.java index e1329888..45b12c43 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/SLINK.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/SLINK.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -133,18 +133,18 @@ public class SLINK<O, D extends Distance<D>> extends AbstractDistanceBasedAlgori public Result run(Database database, Relation<O> relation) { DistanceQuery<O, D> distQuery = database.getDistanceQuery(relation, getDistanceFunction()); Class<D> distCls = (Class<D>) getDistanceFunction().getDistanceFactory().getClass(); - WritableRecordStore store = DataStoreUtil.makeRecordStorage(distQuery.getRelation().getDBIDs(), DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_STATIC, DBID.class, distCls); + WritableRecordStore store = DataStoreUtil.makeRecordStorage(relation.getDBIDs(), DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_STATIC, DBID.class, distCls); pi = store.getStorage(0, DBID.class); lambda = store.getStorage(1, distCls); // Temporary storage for m. - WritableDataStore<D> m = DataStoreUtil.makeStorage(distQuery.getRelation().getDBIDs(), DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP, distCls); + WritableDataStore<D> m = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP, distCls); - FiniteProgress progress = logger.isVerbose() ? new FiniteProgress("Clustering", distQuery.getRelation().size(), logger) : null; + FiniteProgress progress = logger.isVerbose() ? new FiniteProgress("Clustering", relation.size(), logger) : null; // has to be an array for monotonicity reasons! 
- ModifiableDBIDs processedIDs = DBIDUtil.newArray(distQuery.getRelation().size()); + ModifiableDBIDs processedIDs = DBIDUtil.newArray(relation.size()); // apply the algorithm - for(DBID id : distQuery.getRelation().iterDBIDs()) { + for(DBID id : relation.iterDBIDs()) { step1(id); step2(id, processedIDs, distQuery, m); step3(id, processedIDs, m); @@ -168,8 +168,8 @@ public class SLINK<O, D extends Distance<D>> extends AbstractDistanceBasedAlgori BasicResult result = null; // Build clusters identified by their target object - int minc = minclusters != null ? minclusters : distQuery.getRelation().size(); - result = extractClusters(distQuery.getRelation().getDBIDs(), pi, lambda, minc); + int minc = minclusters != null ? minclusters : relation.size(); + result = extractClusters(relation.getDBIDs(), pi, lambda, minc); result.addChildResult(new MaterializedRelation<DBID>("SLINK pi", "slink-order", TypeUtil.DBID, pi, processedIDs)); result.addChildResult(new MaterializedRelation<D>("SLINK lambda", "slink-order", new SimpleTypeInformation<D>(distCls), lambda, processedIDs)); @@ -288,7 +288,7 @@ public class SLINK<O, D extends Distance<D>> extends AbstractDistanceBasedAlgori D stopdist = null; // sort by lambda ArrayModifiableDBIDs order = DBIDUtil.newArray(ids); - Collections.sort(order, new CompareByLambda<D>(lambda)); + order.sort(new CompareByLambda<D>(lambda)); int index = ids.size() - minclusters - 1; while(index >= 0) { if(lambda.get(order.get(index)).equals(lambda.get(order.get(index + 1)))) { @@ -458,7 +458,7 @@ public class SLINK<O, D extends Distance<D>> extends AbstractDistanceBasedAlgori // extract a hierarchical clustering ArrayModifiableDBIDs order = DBIDUtil.newArray(ids); // sort by lambda - Collections.sort(order, new CompareByLambda<D>(lambda)); + order.sort(new CompareByLambda<D>(lambda)); D curdist = null; D stopdist = null; diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/SNNClustering.java 
b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/SNNClustering.java index 3bde2932..7c3a13c9 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/SNNClustering.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/SNNClustering.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -25,7 +25,6 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering; import java.util.ArrayList; import java.util.Iterator; -import java.util.LinkedList; import java.util.List; import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm; @@ -36,6 +35,7 @@ import de.lmu.ifi.dbs.elki.data.model.Model; import de.lmu.ifi.dbs.elki.data.type.TypeInformation; import de.lmu.ifi.dbs.elki.data.type.TypeUtil; import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.ids.DBID; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; @@ -200,8 +200,8 @@ public class SNNClustering<O> extends AbstractAlgorithm<Clustering<Model>> imple * @return the shared nearest neighbors of the specified query object in the * given database */ - protected List<DBID> findSNNNeighbors(SimilarityQuery<O, IntegerDistance> snnInstance, DBID queryObject) { - List<DBID> neighbors = new LinkedList<DBID>(); + protected ArrayModifiableDBIDs findSNNNeighbors(SimilarityQuery<O, IntegerDistance> snnInstance, DBID queryObject) { + ArrayModifiableDBIDs neighbors = DBIDUtil.newArray(); for(DBID id : snnInstance.getRelation().iterDBIDs()) { if(snnInstance.similarity(queryObject, id).compareTo(epsilon) >= 0) { neighbors.add(id); @@ -222,7 +222,7 @@ public class SNNClustering<O> extends AbstractAlgorithm<Clustering<Model>> imple * 
clustering */ protected void expandCluster(SimilarityQuery<O, IntegerDistance> snnInstance, DBID startObjectID, FiniteProgress objprog, IndefiniteProgress clusprog) { - List<DBID> seeds = findSNNNeighbors(snnInstance, startObjectID); + ArrayModifiableDBIDs seeds = findSNNNeighbors(snnInstance, startObjectID); // startObject is no core-object if(seeds.size() < minpts) { @@ -247,11 +247,10 @@ public class SNNClustering<O> extends AbstractAlgorithm<Clustering<Model>> imple noise.remove(seed); } } - seeds.remove(0); while(seeds.size() > 0) { - DBID o = seeds.remove(0); - List<DBID> neighborhood = findSNNNeighbors(snnInstance, o); + DBID o = seeds.remove(seeds.size() - 1); + ArrayModifiableDBIDs neighborhood = findSNNNeighbors(snnInstance, o); if(neighborhood.size() >= minpts) { for(DBID p : neighborhood) { diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/CASH.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/CASH.java index 4a0b391c..b877415e 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/CASH.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/CASH.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.correlation; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -55,7 +55,7 @@ import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation; import de.lmu.ifi.dbs.elki.database.relation.Relation; -import de.lmu.ifi.dbs.elki.datasource.filter.NonNumericFeaturesException; +import de.lmu.ifi.dbs.elki.datasource.filter.normalization.NonNumericFeaturesException; import de.lmu.ifi.dbs.elki.distance.distancefunction.WeightedDistanceFunction; import 
de.lmu.ifi.dbs.elki.distance.distancevalue.DoubleDistance; import de.lmu.ifi.dbs.elki.logging.Logging; @@ -84,7 +84,12 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; /** * Provides the CASH algorithm, an subspace clustering algorithm based on the - * hough transform. + * Hough transform. + * + * <b>Note:</b> CASH requires explicitly setting the input parser other than default to + * {@link de.lmu.ifi.dbs.elki.datasource.parser.ParameterizationFunctionLabelParser}: + * (in the MiniGui, set option: dbc.parser ParameterizationFunctionLabelParser). + * * <p> * Reference: E. Achtert, C. Böhm, J. David, P. Kröger, A. Zimek: Robust * clustering in arbitrarily oriented subspaces. <br> @@ -99,7 +104,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; */ // todo elke hierarchy (later) @Title("CASH: Robust clustering in arbitrarily oriented subspaces") -@Description("Subspace clustering algorithm based on the hough transform.") +@Description("Subspace clustering algorithm based on the Hough transform.") @Reference(authors = "E. Achtert, C. Böhm, J. David, P. Kröger, A. Zimek", title = "Robust clustering in arbitraily oriented subspaces", booktitle = "Proc. 8th SIAM Int. Conf. 
on Data Mining (SDM'08), Atlanta, GA, 2008", url = "http://www.siam.org/proceedings/datamining/2008/dm08_69_AchtertBoehmDavidKroegerZimek.pdf") public class CASH extends AbstractAlgorithm<Clustering<Model>> implements ClusteringAlgorithm<Clustering<Model>> { /** @@ -349,7 +354,7 @@ public class CASH extends AbstractAlgorithm<Clustering<Model>> implements Cluste res.addCluster(c); noiseIDs.removeDBIDs(interval.getIDs()); clusterIDs.addDBIDs(interval.getIDs()); - processedIDs.addAll(interval.getIDs()); + processedIDs.addDBIDs(interval.getIDs()); } // Rebuild heap @@ -372,13 +377,13 @@ public class CASH extends AbstractAlgorithm<Clustering<Model>> implements Cluste if(dim == noiseDim) { Cluster<Model> c = new Cluster<Model>(noiseIDs, true, ClusterModel.CLUSTER); res.addCluster(c); - processedIDs.addAll(noiseIDs); + processedIDs.addDBIDs(noiseIDs); } else if(noiseIDs.size() >= minPts) { LinearEquationSystem les = runDerivator(fulldatabase, dim - 1, noiseIDs); Cluster<Model> c = new Cluster<Model>(noiseIDs, true, new LinearEquationModel(les)); res.addCluster(c); - processedIDs.addAll(noiseIDs); + processedIDs.addDBIDs(noiseIDs); } } @@ -521,7 +526,7 @@ public class CASH extends AbstractAlgorithm<Clustering<Model>> implements Cluste private ParameterizationFunction project(Matrix basis, ParameterizationFunction f) { // Matrix m = new Matrix(new // double[][]{f.getPointCoordinates()}).times(basis); - Matrix m = f.getRowVector().times(basis); + Matrix m = f.getColumnVector().transposeTimes(basis); ParameterizationFunction f_t = new ParameterizationFunction(m.getColumnPackedCopy()); return f_t; } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/COPAC.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/COPAC.java index 8fc30b3d..575bf117 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/COPAC.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/COPAC.java @@ -4,7 +4,7 @@ package 
de.lmu.ifi.dbs.elki.algorithm.clustering.correlation; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/ERiC.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/ERiC.java index 75633853..af4f677f 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/ERiC.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/ERiC.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.correlation; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -244,7 +244,7 @@ public class ERiC<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Cluste } else { ModifiableDBIDs merged = DBIDUtil.newHashSet(noise.getIDs()); - merged.addAll(clus.getIDs().asCollection()); + merged.addDBIDs(clus.getIDs()); noise.setIDs(merged); } } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/FourC.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/FourC.java index 93d0cc99..98761962 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/FourC.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/FourC.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.correlation; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/HiCO.java 
b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/HiCO.java index 92723428..1065682c 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/HiCO.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/HiCO.java @@ -3,7 +3,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.correlation; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures -Copyright (C) 2011 +Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/LMCLUS.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/LMCLUS.java new file mode 100644 index 00000000..41ee1f69 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/LMCLUS.java @@ -0,0 +1,566 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.correlation; +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2012 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import java.util.Random; + +import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm; +import de.lmu.ifi.dbs.elki.data.Cluster; +import de.lmu.ifi.dbs.elki.data.Clustering; +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.data.model.Model; +import de.lmu.ifi.dbs.elki.data.type.TypeInformation; +import de.lmu.ifi.dbs.elki.data.type.TypeUtil; +import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.ids.DBIDs; +import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress; +import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress; +import de.lmu.ifi.dbs.elki.math.MeanVariance; +import de.lmu.ifi.dbs.elki.math.histograms.FlexiHistogram; +import de.lmu.ifi.dbs.elki.math.linearalgebra.Matrix; +import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; +import de.lmu.ifi.dbs.elki.utilities.DatabaseUtil; +import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; +import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException; +import de.lmu.ifi.dbs.elki.utilities.exceptions.UnableToComplyException; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; +import de.lmu.ifi.dbs.elki.utilities.pairs.DoubleObjPair; + +/** + * Linear manifold clustering in high dimensional 
spaces by stochastic search. + * + * Reference: + * <p> + * Robert Haralick, Rave Harpaz<br /> + * Linear manifold clustering in high dimensional spaces by stochastic search<br/> + * In: Pattern Recognition volume 40, Issue 10 + * </p> + * + * Implementation note: the LMCLUS algorithm seems to lack good stopping + * criterions. We can't entirely reproduce the good results from the original + * publication, in particular not on noisy data. But the questionable parts are + * as in the original publication, associated thesis and published source code. + * The minimum cluster size however can serve as a hidden stopping criterion. + * + * @author Ernst Waas + * @author Erich Schubert + */ +@Reference(authors = "Robert Haralick, Rave Harpaz", title = "Linear manifold clustering in high dimensional spaces by stochastic search", booktitle = "Pattern Recognition volume 40, Issue 10", url = "http://dx.doi.org/10.1016/j.patcog.2007.01.020") +public class LMCLUS extends AbstractAlgorithm<Clustering<Model>> { + /** + * The logger for this class. + */ + private static final Logging logger = Logging.getLogger(LMCLUS.class); + + /** + * Epsilon + */ + private final static double NOT_FROM_ONE_CLUSTER_PROBABILITY = 0.2; + + /** + * Histogram resolution + */ + private final static int BINS = 50; + + /** + * The current threshold value calculated by the findSeperation Method. + */ + private final double sensitivityThreshold; + + /** + * Maximum cluster dimensionality + */ + private final int maxLMDim; + + /** + * Minimum cluster size + */ + private final int minsize; + + /** + * Number of sampling rounds to find a good split + */ + private final int samplingLevel; + + /** + * Constructor. 
+ * + * @param maxdim Maximum dimensionality + * @param minsize Minimum cluster size + * @param samplingLevel Sampling level + * @param sensitivityThreshold Threshold + */ + public LMCLUS(int maxdim, int minsize, int samplingLevel, double sensitivityThreshold) { + super(); + this.maxLMDim = maxdim; + this.minsize = minsize; + this.samplingLevel = samplingLevel; + this.sensitivityThreshold = sensitivityThreshold; + } + + /** + * The main LMCLUS (Linear manifold clustering algorithm) is processed in this + * method. + * + * <PRE> + * The algorithm samples random linear manifolds and tries to find clusters in it. + * It calculates a distance histogram searches for a threshold and partitions the + * points in two groups the ones in the cluster and everything else. + * Then the best fitting linear manifold is searched and registered as a cluster. + * The process is started over until all points are clustered. + * The last cluster should contain all the outliers. (or the whole data if no clusters have been found.) + * For details see {@link LMCLUS}. + * </PRE> + * + * @param database The database to operate on + * @param relation Relation + * @return Clustering result + * @throws de.lmu.ifi.dbs.elki.utilities.UnableToComplyException + */ + public Clustering<Model> run(Database database, Relation<NumberVector<?, ?>> relation) throws UnableToComplyException { + Clustering<Model> ret = new Clustering<Model>("LMCLUS Clustering", "lmclus-clustering"); + FiniteProgress progress = logger.isVerbose() ? new FiniteProgress("Clustered objects", relation.size(), logger) : null; + IndefiniteProgress cprogress = logger.isVerbose() ? 
new IndefiniteProgress("Clusters found", logger) : null; + ModifiableDBIDs unclustered = DBIDUtil.newHashSet(relation.getDBIDs()); + + final int maxdim = Math.min(maxLMDim, DatabaseUtil.dimensionality(relation)); + int cnum = 0; + while(unclustered.size() > minsize) { + DBIDs current = unclustered; + int lmDim = 1; + for(int k = 1; k <= maxdim; k++) { + // Implementation note: this while loop is from the original publication + // and the published LMCLUS source code. It doesn't make sense to me - + // it is lacking a stop criterion other than "cluster is too small" and + // "cluster is inseparable"! Additionally, there is good criterion for + // stopping at the appropriate dimensionality either. + while(true) { + Separation separation = findSeparation(relation, current, k); + // logger.verbose("k: " + k + " goodness: " + separation.goodness + + // " threshold: " + separation.threshold); + if(separation.goodness <= sensitivityThreshold) { + break; + } + ModifiableDBIDs subset = DBIDUtil.newArray(current.size()); + for(DBID id : current) { + if(deviation(relation.get(id).getColumnVector().minusEquals(separation.originV), separation.basis) < separation.threshold) { + subset.add(id); + } + } + // logger.verbose("size:"+subset.size()); + if(subset.size() < minsize) { + break; + } + current = subset; + lmDim = k; + // System.out.println("Partition: " + subset.size()); + } + } + // No more clusters found + if(current.size() < minsize || current == unclustered) { + break; + } + // New cluster found + // TODO: annotate cluster with dimensionality + final Cluster<Model> cluster = new Cluster<Model>(current); + cluster.setName("Cluster_" + lmDim + "d_" + cnum); + cnum++; + ret.addCluster(cluster); + // Remove from main working set. 
+ unclustered.removeDBIDs(current); + if(progress != null) { + progress.setProcessed(relation.size() - unclustered.size(), logger); + } + if(cprogress != null) { + cprogress.setProcessed(cnum, logger); + } + } + // Remaining objects are noise + if(unclustered.size() > 0) { + ret.addCluster(new Cluster<Model>(unclustered, true)); + } + if(progress != null) { + progress.setProcessed(relation.size(), logger); + progress.ensureCompleted(logger); + } + if(cprogress != null) { + cprogress.setCompleted(logger); + } + return ret; + } + + /** + * Deviation from a manifold described by beta. + * + * @param delta Delta from origin vector + * @param beta Manifold + * @return Deviation score + */ + private double deviation(Vector delta, Matrix beta) { + double a = delta.euclideanLength(); + double b = beta.transposeTimes(delta).euclideanLength(); + return Math.sqrt((a * a) - (b * b)); + } + + /** + * This method samples a number of linear manifolds an tries to determine + * which the one with the best cluster is. + * + * <PRE> + * A number of sample points according to the dimension of the linear manifold are taken. + * The basis (B) and the origin(o) of the manifold are calculated. + * A distance histogram using the distance function ||x-o|| -||B^t*(x-o)|| is generated. + * The best threshold is searched using the elevate threshold function. + * The overall goodness of the threshold is determined. + * The process is redone until a specific number of samples is taken. + * </PRE> + * + * @param relation The vector relation + * @param currentids Current DBIDs + * @param dimension the dimension of the linear manifold to sample. + * @return the overall goodness of the separation. The values origin basis and + * threshold are returned indirectly over class variables. 
+ */ + private Separation findSeparation(Relation<NumberVector<?, ?>> relation, DBIDs currentids, int dimension) { + Separation separation = new Separation(); + // determine the number of samples needed, to secure that with a specific + // probability + // in at least on sample every sampled point is from the same cluster. + int samples = (int) Math.min(Math.log(NOT_FROM_ONE_CLUSTER_PROBABILITY) / (Math.log(1 - Math.pow((1.0d / samplingLevel), dimension))), (double) currentids.size()); + // System.out.println("Number of samples: " + samples); + Random r = new Random(); + int remaining_retries = 100; + for(int i = 1; i <= samples; i++) { + DBIDs sample = DBIDUtil.randomSample(currentids, dimension + 1, r.nextLong()); + final Iterator<DBID> iter = sample.iterator(); + // Use first as origin + DBID origin = iter.next(); + Vector originV = relation.get(origin).getColumnVector(); + // Build orthogonal basis from remainder + Matrix basis; + { + List<Vector> vectors = new ArrayList<Vector>(sample.size() - 1); + while(iter.hasNext()) { + Vector vec = relation.get(iter.next()).getColumnVector(); + vectors.add(vec.minusEquals(originV)); + } + // generate orthogonal basis + basis = generateOrthonormalBasis(vectors); + if(basis == null) { + // new sample has to be taken. + i--; + remaining_retries--; + if(remaining_retries < 0) { + throw new AbortException("Too many retries in sampling, and always a linear dependant data set."); + } + continue; + } + } + // Generate and fill a histogram. 
+ FlexiHistogram<Double, Double> histogram = FlexiHistogram.DoubleSumHistogram(BINS); + double w = 1.0 / currentids.size(); + for(DBID point : currentids) { + // Skip sampled points + if(sample.contains(point)) { + continue; + } + Vector vec = relation.get(point).getColumnVector().minusEquals(originV); + final double distance = deviation(vec, basis); + histogram.aggregate(distance, w); + } + double[] th = findAndEvaluateThreshold(histogram); // evaluate threshold + if(th[1] > separation.goodness) { + separation.goodness = th[1]; + separation.threshold = th[0]; + separation.originV = originV; + separation.basis = basis; + } + } + return separation; + } + + /** + * This Method generates an orthonormal basis from a set of Vectors. It uses + * the established Gram-Schmidt algorithm for orthonormalisation: + * + * <PRE> + * u_1 = v_1 + * u_k = v_k -proj_u1(v_k)...proj_u(k-1)(v_k); + * + * Where proj_u(v) = <v,u>/<u,u> *u + * </PRE> + * + * @param vectors The set of vectors to generate the orthonormal basis from + * @return the orthonormal basis generated by this method. + * @throws RuntimeException if the given vectors are not linear independent. + */ + private Matrix generateOrthonormalBasis(List<Vector> vectors) { + Vector first = vectors.get(0); + first = first.times(1.0 / first.euclideanLength()); + Matrix ret = new Matrix(first.getDimensionality(), vectors.size()); + ret.setCol(0, first); + for(int i = 1; i < vectors.size(); i++) { + // System.out.println("Matrix:" + ret); + Vector v_i = vectors.get(i); + Vector u_i = v_i.copy(); + // System.out.println("Vector " + i + ":" + partialSol); + for(int j = 0; j < i; j++) { + Vector v_j = ret.getCol(j); + double f = v_i.transposeTimes(v_j) / v_j.transposeTimes(v_j); + if(Double.isNaN(f)) { + if(logger.isDebuggingFine()) { + logger.debugFine("Zero vector encountered? 
" + v_j); + } + return null; + } + u_i.minusTimesEquals(v_j, f); + } + // check if the vectors weren't independent + final double len_u_i = u_i.euclideanLength(); + if(len_u_i == 0.0) { + if(logger.isDebuggingFine()) { + logger.debugFine("Points not independent - no orthonormalization."); + } + return null; + } + // System.out.println("Vector " + i + ":" + partialSol); + u_i.timesEquals(1 / len_u_i); + ret.setCol(i, u_i); + } + return ret; + } + + /** + * Evaluate the histogram to find a suitable threshold + * + * @param histogram Histogram to evaluate + * @return Position and goodness + */ + private double[] findAndEvaluateThreshold(FlexiHistogram<Double, Double> histogram) { + int n = histogram.getNumBins(); + double[] p1 = new double[n]; + double[] p2 = new double[n]; + double[] mu1 = new double[n]; + double[] mu2 = new double[n]; + double[] sigma1 = new double[n]; + double[] sigma2 = new double[n]; + double[] jt = new double[n]; + // Forward pass + { + MeanVariance mv = new MeanVariance(); + Iterator<DoubleObjPair<Double>> forward = histogram.iterator(); + for(int i = 0; forward.hasNext(); i++) { + DoubleObjPair<Double> pair = forward.next(); + p1[i] = pair.second + ((i > 0) ? p1[i - 1] : 0); + mv.put(i, pair.second); + mu1[i] = mv.getMean(); + sigma1[i] = mv.getNaiveStddev(); + } + } + // Backwards pass + { + MeanVariance mv = new MeanVariance(); + Iterator<DoubleObjPair<Double>> backwards = histogram.reverseIterator(); + for(int j = n - 1; backwards.hasNext(); j--) { + DoubleObjPair<Double> pair = backwards.next(); + p2[j] = pair.second + ((j + 1 < n) ? 
p2[j + 1] : 0); + mv.put(j, pair.second); + mu2[j] = mv.getMean(); + sigma2[j] = mv.getNaiveStddev(); + } + } + + for(int i = 0; i < n; i++) { + jt[i] = 1.0 + 2 * (p1[i] * (Math.log(sigma1[i]) - Math.log(p1[i])) + p2[i] * (Math.log(sigma2[i]) - Math.log(p2[i]))); + } + + int bestpos = -1; + double bestgoodness = Double.NEGATIVE_INFINITY; + + double devPrev = jt[1] - jt[0]; + for(int i = 1; i < jt.length - 1; i++) { + double devCur = jt[i + 1] - jt[i]; + // System.out.println(p1[i]); + // System.out.println(jt[i + 1]); + // System.out.println(jt[i]); + // System.out.println(devCur); + // Local minimum found - calculate depth + if(devCur >= 0 && devPrev <= 0) { + double lowestMaxima = Double.POSITIVE_INFINITY; + for(int j = i - 1; j > 0; j--) { + if(jt[j - 1] < jt[j]) { + lowestMaxima = Math.min(lowestMaxima, jt[j]); + break; + } + } + for(int j = i + 1; j < n - 2; j++) { + if(jt[j + 1] < jt[j]) { + lowestMaxima = Math.min(lowestMaxima, jt[j]); + break; + } + } + double localDepth = lowestMaxima - jt[i]; + + final double mud = mu1[i] - mu2[i]; + double discriminability = mud * mud / (sigma1[i] * sigma1[i] + sigma2[i] * sigma2[i]); + if(Double.isNaN(discriminability)) { + discriminability = -1; + } + double goodness = localDepth * discriminability; + if(goodness > bestgoodness) { + bestgoodness = goodness; + bestpos = i; + } + } + devPrev = devCur; + } + return new double[] { histogram.getBinMax(bestpos), bestgoodness }; + } + + @Override + protected Logging getLogger() { + return logger; + } + + @Override + public TypeInformation[] getInputTypeRestriction() { + return TypeUtil.array(TypeUtil.NUMBER_VECTOR_FIELD); + } + + /** + * Class to represent a linear manifold separation + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + private static class Separation { + /** + * Goodness of separation + */ + double goodness = Double.NEGATIVE_INFINITY; + + /** + * Threshold + */ + double threshold = Double.NEGATIVE_INFINITY; + + /** + * Basis of manifold + */ + 
Matrix basis = null; + + /** + * Origin vector + */ + Vector originV = null; + } + + /** + * Parameterization class + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer extends AbstractParameterizer { + /** + * Parameter with the maximum dimension to search for + */ + public static final OptionID MAXDIM_ID = OptionID.getOrCreateOptionID("lmclus.maxdim", "Maximum linear manifold dimension to search."); + + /** + * Parameter for the minimum cluster size + */ + public static final OptionID MINSIZE_ID = OptionID.getOrCreateOptionID("lmclus.minsize", "Minimum cluster size to allow."); + + /** + * Sampling intensity level + */ + public static final OptionID SAMPLINGL_ID = OptionID.getOrCreateOptionID("lmclus.sampling-level", "A number used to determine how many samples are taken in each search."); + + /** + * Global significance threshold + */ + public static final OptionID THRESHOLD_ID = OptionID.getOrCreateOptionID("lmclus.threshold", "Threshold to determine if a cluster was found."); + + /** + * Maximum dimensionality to search for + */ + private int maxdim = Integer.MAX_VALUE; + + /** + * Minimum cluster size. 
+ */ + private int minsize; + + /** + * Sampling level + */ + private int samplingLevel; + + /** + * Threshold + */ + private double threshold; + + @Override + protected void makeOptions(Parameterization config) { + super.makeOptions(config); + IntParameter maxLMDimP = new IntParameter(MAXDIM_ID, new GreaterEqualConstraint(1), true); + if(config.grab(maxLMDimP)) { + maxdim = maxLMDimP.getValue(); + } + IntParameter minsizeP = new IntParameter(MINSIZE_ID, new GreaterEqualConstraint(1)); + if(config.grab(minsizeP)) { + minsize = minsizeP.getValue(); + } + IntParameter samplingLevelP = new IntParameter(SAMPLINGL_ID, 100); + if(config.grab(samplingLevelP)) { + samplingLevel = samplingLevelP.getValue(); + } + DoubleParameter sensivityThresholdP = new DoubleParameter(THRESHOLD_ID); + if(config.grab(sensivityThresholdP)) { + threshold = sensivityThresholdP.getValue(); + } + } + + @Override + protected LMCLUS makeInstance() { + return new LMCLUS(maxdim, minsize, samplingLevel, threshold); + } + } +}
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/ORCLUS.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/ORCLUS.java index 924e1786..eb5608fc 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/ORCLUS.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/ORCLUS.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.correlation; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -50,6 +50,7 @@ import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress; import de.lmu.ifi.dbs.elki.math.linearalgebra.Matrix; import de.lmu.ifi.dbs.elki.math.linearalgebra.SortedEigenPairs; +import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; import de.lmu.ifi.dbs.elki.math.linearalgebra.pca.PCAResult; import de.lmu.ifi.dbs.elki.math.linearalgebra.pca.PCARunner; import de.lmu.ifi.dbs.elki.utilities.ClassGenericsUtil; @@ -420,14 +421,14 @@ public class ORCLUS<V extends NumberVector<V, ?>> extends AbstractProjectedClust /** * Returns the union of the two specified clusters. 
* - * @param database the database holding the objects + * @param relation the database holding the objects * @param distFunc the distance function * @param c1 the first cluster * @param c2 the second cluster * @param dim the dimensionality of the union cluster * @return the union of the two specified clusters */ - private ORCLUSCluster union(Relation<V> database, DistanceQuery<V, DoubleDistance> distFunc, ORCLUSCluster c1, ORCLUSCluster c2, int dim) { + private ORCLUSCluster union(Relation<V> relation, DistanceQuery<V, DoubleDistance> distFunc, ORCLUSCluster c1, ORCLUSCluster c2, int dim) { ORCLUSCluster c = new ORCLUSCluster(); c.objectIDs = DBIDUtil.newHashSet(c1.objectIDs); @@ -436,11 +437,13 @@ public class ORCLUS<V extends NumberVector<V, ?>> extends AbstractProjectedClust c.objectIDs = DBIDUtil.newArray(c.objectIDs); if(c.objectIDs.size() > 0) { - c.centroid = DatabaseUtil.centroid(database, c.objectIDs); - c.basis = findBasis(database, distFunc, c, dim); + c.centroid = DatabaseUtil.centroid(relation, c.objectIDs); + c.basis = findBasis(relation, distFunc, c, dim); } else { - c.centroid = c1.centroid.plus(c2.centroid).multiplicate(0.5); + V factory = DatabaseUtil.assumeVectorField(relation).getFactory(); + Vector cent = c1.centroid.getColumnVector().plusEquals(c2.centroid.getColumnVector()).timesEquals(0.5); + c.centroid = factory.newNumberVector(cent.getArrayRef()); double[][] doubles = new double[c1.basis.getRowDimensionality()][dim]; for(int i = 0; i < dim; i++) { doubles[i][i] = 1; @@ -460,9 +463,9 @@ public class ORCLUS<V extends NumberVector<V, ?>> extends AbstractProjectedClust * @return the projection of double vector o in the subspace of cluster c */ private V projection(ORCLUSCluster c, V o, V factory) { - Matrix o_proj = o.getRowVector().times(c.basis); + Matrix o_proj = o.getColumnVector().transposeTimes(c.basis); double[] values = o_proj.getColumnPackedCopy(); - return factory.newInstance(values); + return factory.newNumberVector(values); } 
@Override @@ -523,7 +526,7 @@ public class ORCLUS<V extends NumberVector<V, ?>> extends AbstractProjectedClust for(int d = 1; d <= o.getDimensionality(); d++) { values[d - 1] = o.doubleValue(d); } - this.centroid = factory.newInstance(values); + this.centroid = factory.newNumberVector(values); } } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/cash/CASHInterval.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/cash/CASHInterval.java index 6c5db740..46112498 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/cash/CASHInterval.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/cash/CASHInterval.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.correlation.cash; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/cash/CASHIntervalSplit.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/cash/CASHIntervalSplit.java index 62ff658f..86e045cb 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/cash/CASHIntervalSplit.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/cash/CASHIntervalSplit.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.correlation.cash; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/cash/package-info.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/cash/package-info.java index 8cd156e8..8b6d104c 100644 --- 
a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/cash/package-info.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/cash/package-info.java @@ -7,7 +7,7 @@ This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures -Copyright (C) 2011 +Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/package-info.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/package-info.java index 82a1f1e1..665de632 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/package-info.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/package-info.java @@ -7,7 +7,7 @@ This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures -Copyright (C) 2011 +Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeans.java new file mode 100644 index 00000000..d3c73b53 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeans.java @@ -0,0 +1,310 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; + +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; + +import de.lmu.ifi.dbs.elki.algorithm.AbstractPrimitiveDistanceBasedAlgorithm; +import de.lmu.ifi.dbs.elki.data.Clustering; +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.data.model.MeanModel; +import de.lmu.ifi.dbs.elki.data.type.TypeInformation; +import de.lmu.ifi.dbs.elki.data.type.TypeUtil; +import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; +import 
de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDoubleDistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance; +import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2012 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +/** + * Abstract base class for k-means implementations. + * + * @author Erich Schubert + * + * @param <V> Vector type + * @param <D> Distance type + */ +public abstract class AbstractKMeans<V extends NumberVector<V, ?>, D extends Distance<D>> extends AbstractPrimitiveDistanceBasedAlgorithm<NumberVector<?, ?>, D, Clustering<MeanModel<V>>> { + /** + * Parameter to specify the number of clusters to find, must be an integer + * greater than 0. 
+ */ + public static final OptionID K_ID = OptionID.getOrCreateOptionID("kmeans.k", "The number of clusters to find."); + + /** + * Parameter to specify the maximum number of iterations, must be an integer + * greater or equal to 0, where 0 means no limit. + */ + public static final OptionID MAXITER_ID = OptionID.getOrCreateOptionID("kmeans.maxiter", "The maximum number of iterations to do. 0 means no limit."); + + /** + * Parameter to specify the random generator seed. + */ + public static final OptionID SEED_ID = OptionID.getOrCreateOptionID("kmeans.seed", "The random number generator seed."); + + /** + * Parameter to specify the initialization method. + */ + public static final OptionID INIT_ID = OptionID.getOrCreateOptionID("kmeans.initialization", "Method to choose the initial means."); + + /** + * Holds the value of {@link #K_ID}. + */ + protected int k; + + /** + * Holds the value of {@link #MAXITER_ID}. + */ + protected int maxiter; + + /** + * Method to choose initial means. + */ + protected KMeansInitialization<V> initializer; + + /** + * Constructor. + * + * @param distanceFunction distance function + * @param k number of clusters to find + * @param maxiter maximum number of iterations (0 means no limit) + */ + public AbstractKMeans(PrimitiveDistanceFunction<NumberVector<?, ?>, D> distanceFunction, int k, int maxiter, KMeansInitialization<V> initializer) { + super(distanceFunction); + this.k = k; + this.maxiter = maxiter; + this.initializer = initializer; + } + + /** + * Assigns every object to its nearest mean, updating the given clusters in place. Afterwards the i<sup>th</sup> cluster contains the ids of + * those vectors that are nearest to the i<sup>th</sup> mean. + * + * @param relation the database to cluster + * @param means a list of k means + * @param clusters cluster assignment (modified by this method) + * @return true when at least one object was newly added to its nearest cluster + */ + protected boolean assignToNearestCluster(Relation<V> relation, List<Vector> means, List<?
extends ModifiableDBIDs> clusters) { + boolean changed = false; + + if(getDistanceFunction() instanceof PrimitiveDoubleDistanceFunction) { + @SuppressWarnings("unchecked") + final PrimitiveDoubleDistanceFunction<? super NumberVector<?, ?>> df = (PrimitiveDoubleDistanceFunction<? super NumberVector<?, ?>>) getDistanceFunction(); + for(DBID id : relation.iterDBIDs()) { + double mindist = Double.POSITIVE_INFINITY; + V fv = relation.get(id); + int minIndex = 0; + for(int i = 0; i < k; i++) { + double dist = df.doubleDistance(fv, means.get(i)); + if(dist < mindist) { + minIndex = i; + mindist = dist; + } + } + if(clusters.get(minIndex).add(id)) { + changed = true; + // Remove from previous cluster + // TODO: keep a list of cluster assignments to save this search? + for(int i = 0; i < k; i++) { + if(i != minIndex) { + if(clusters.get(i).remove(id)) { + break; + } + } + } + } + } + } + else { + final PrimitiveDistanceFunction<? super NumberVector<?, ?>, D> df = getDistanceFunction(); + for(DBID id : relation.iterDBIDs()) { + D mindist = df.getDistanceFactory().infiniteDistance(); + V fv = relation.get(id); + int minIndex = 0; + for(int i = 0; i < k; i++) { + D dist = df.distance(fv, means.get(i)); + if(dist.compareTo(mindist) < 0) { + minIndex = i; + mindist = dist; + } + } + if(clusters.get(minIndex).add(id)) { + changed = true; + // Remove from previous cluster + // TODO: keep a list of cluster assignments to save this search? + for(int i = 0; i < k; i++) { + if(i != minIndex) { + if(clusters.get(i).remove(id)) { + break; + } + } + } + } + } + } + return changed; + } + + @Override + public TypeInformation[] getInputTypeRestriction() { + return TypeUtil.array(TypeUtil.NUMBER_VECTOR_FIELD); + } + + /** + * Returns the mean vectors of the given clusters in the given database.
+ * + * @param clusters the clusters to compute the means of + * @param means the previous means, reused for clusters that are empty + * @param database the relation containing the vectors + * @return the mean vectors of the given clusters in the given database + */ + protected List<Vector> means(List<? extends ModifiableDBIDs> clusters, List<Vector> means, Relation<V> database) { + List<Vector> newMeans = new ArrayList<Vector>(k); + for(int i = 0; i < k; i++) { + ModifiableDBIDs list = clusters.get(i); + Vector mean = null; + for(Iterator<DBID> clusterIter = list.iterator(); clusterIter.hasNext();) { + if(mean == null) { + mean = database.get(clusterIter.next()).getColumnVector(); + } + else { + mean.plusEquals(database.get(clusterIter.next()).getColumnVector()); + } + } + if(list.size() > 0) { + assert mean != null; + mean.timesEquals(1.0 / list.size()); + } + else { + mean = means.get(i); + } + newMeans.add(mean); + } + return newMeans; + } + + /** + * Incrementally update a mean, in place, after one vector was added to or removed from its cluster. + * + * @param mean Mean to update (modified in place) + * @param vec Object vector that was added or removed + * @param newsize Size of the cluster after the change; 0 keeps the old mean + * @param op Cluster size change / weight change: +1 for an added vector, -1 for a removed one + */ + protected void incrementalUpdateMean(Vector mean, V vec, int newsize, double op) { + if(newsize == 0) { + return; // Keep old mean + } + Vector delta = vec.getColumnVector(); + // Compute difference from mean + delta.minusEquals(mean); + delta.timesEquals(op / newsize); + mean.plusEquals(delta); + } + + /** + * Perform a MacQueen style iteration: reassign each object and update the affected means immediately. + * + * @param relation Relation + * @param means Means (updated in place) + * @param clusters Clusters (updated in place) + * @return true when the means have changed + */ + protected boolean macQueenIterate(Relation<V> relation, List<Vector> means, List<ModifiableDBIDs> clusters) { + boolean changed = false; + + if(getDistanceFunction() instanceof PrimitiveDoubleDistanceFunction) { + // Raw distance function + @SuppressWarnings("unchecked") + final PrimitiveDoubleDistanceFunction<? super NumberVector<?, ?>> df = (PrimitiveDoubleDistanceFunction<?
super NumberVector<?, ?>>) getDistanceFunction(); + + // Incremental update + for(DBID id : relation.iterDBIDs()) { + double mindist = Double.POSITIVE_INFINITY; + V fv = relation.get(id); + int minIndex = 0; + for(int i = 0; i < k; i++) { + double dist = df.doubleDistance(fv, means.get(i)); + if(dist < mindist) { + minIndex = i; + mindist = dist; + } + } + // Update the cluster mean incrementally: + for(int i = 0; i < k; i++) { + ModifiableDBIDs ci = clusters.get(i); + if(i == minIndex) { + if(ci.add(id)) { + incrementalUpdateMean(means.get(i), relation.get(id), ci.size(), +1); + changed = true; + } + } + else if(ci.remove(id)) { + incrementalUpdateMean(means.get(i), relation.get(id), ci.size(), -1); // size after removal + changed = true; + } + } + } + } + else { + // Generic distance function + final PrimitiveDistanceFunction<? super NumberVector<?, ?>, D> df = getDistanceFunction(); + + // Incremental update + for(DBID id : relation.iterDBIDs()) { + D mindist = df.getDistanceFactory().infiniteDistance(); + V fv = relation.get(id); + int minIndex = 0; + for(int i = 0; i < k; i++) { + D dist = df.distance(fv, means.get(i)); + if(dist.compareTo(mindist) < 0) { + minIndex = i; + mindist = dist; + } + } + // Update the cluster mean incrementally: + for(int i = 0; i < k; i++) { + ModifiableDBIDs ci = clusters.get(i); + if(i == minIndex) { + if(ci.add(id)) { + incrementalUpdateMean(means.get(i), relation.get(id), ci.size(), +1); + changed = true; + } + } + else if(ci.remove(id)) { + incrementalUpdateMean(means.get(i), relation.get(id), ci.size(), -1); // size after removal + changed = true; + } + } + } + } + return changed; + } +}
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeansInitialization.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeansInitialization.java new file mode 100644 index 00000000..b5f088fb --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeansInitialization.java @@ -0,0 +1,71 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2012 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.LongParameter; + +/** + * Abstract base class for common k-means initializations. + * + * @author Erich Schubert + * + * @param <V> Vector type + */ +public abstract class AbstractKMeansInitialization<V extends NumberVector<V, ?>> implements KMeansInitialization<V> { + /** + * Holds the value of {@link KMeansLloyd#SEED_ID}. 
+ */ + protected Long seed; + + /** + * Constructor. + * + * @param seed Random seed. + */ + public AbstractKMeansInitialization(Long seed) { + this.seed = seed; + } + + /** + * Parameterization class. + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public abstract static class Parameterizer<V extends NumberVector<V, ?>> extends AbstractParameterizer { + protected Long seed; + + @Override + protected void makeOptions(Parameterization config) { + super.makeOptions(config); + LongParameter seedP = new LongParameter(AbstractKMeans.SEED_ID, true); + if(config.grab(seedP)) { + seed = seedP.getValue(); + } + } + } +}
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/FirstKInitialMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/FirstKInitialMeans.java new file mode 100644 index 00000000..78ccd426 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/FirstKInitialMeans.java @@ -0,0 +1,74 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2012 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; + +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; +import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; + +/** + * Initialize K-means by using the first k objects as initial means. 
+ * + * @author Erich Schubert + * + * @param <V> Vector type + */ +public class FirstKInitialMeans<V extends NumberVector<V, ?>> extends AbstractKMeansInitialization<V> { + /** + * Constructor. + */ + public FirstKInitialMeans() { + super(null); + } + + @Override + public List<Vector> chooseInitialMeans(Relation<V> relation, int k, PrimitiveDistanceFunction<? super V, ?> distanceFunction) { + Iterator<DBID> iter = relation.iterDBIDs(); + List<Vector> means = new ArrayList<Vector>(k); + for(int i = 0; i < k && iter.hasNext(); i++) { + means.add(relation.get(iter.next()).getColumnVector()); + } + return means; + } + + /** + * Parameterization class. + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer<V extends NumberVector<V, ?>> extends AbstractParameterizer { + @Override + protected FirstKInitialMeans<V> makeInstance() { + return new FirstKInitialMeans<V>(); + } + } +}
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansInitialization.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansInitialization.java new file mode 100644 index 00000000..f4c0d9c7 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansInitialization.java @@ -0,0 +1,49 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2012 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ +import java.util.List; + +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; +import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; + +/** + * Interface for initializing K-Means + * + * @author Erich Schubert + * + * @param <V> Vector type + */ +public interface KMeansInitialization<V extends NumberVector<V, ?>> { + /** + * Choose initial means + * + * @param relation Relation + * @param k Parameter k + * @param distanceFunction Distance function + * @return List of chosen means for k-means + */ + public abstract List<Vector> chooseInitialMeans(Relation<V> relation, int k, PrimitiveDistanceFunction<? super V, ?> distanceFunction); +}
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansLloyd.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansLloyd.java new file mode 100644 index 00000000..fda1d6c0 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansLloyd.java @@ -0,0 +1,176 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2012 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +import java.util.ArrayList; +import java.util.List; + +import de.lmu.ifi.dbs.elki.algorithm.AbstractPrimitiveDistanceBasedAlgorithm; +import de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm; +import de.lmu.ifi.dbs.elki.data.Cluster; +import de.lmu.ifi.dbs.elki.data.Clustering; +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.data.model.MeanModel; +import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance; +import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; +import de.lmu.ifi.dbs.elki.utilities.DatabaseUtil; +import de.lmu.ifi.dbs.elki.utilities.documentation.Description; +import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; +import de.lmu.ifi.dbs.elki.utilities.documentation.Title; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; + +/** + * Provides the k-means algorithm, using Lloyd-style bulk iterations. + * + * <p> + * Reference:<br /> + * S. 
Lloyd<br/> + * Least squares quantization in PCM<br/> + * IEEE Transactions on Information Theory 28 (2)<br/> + * previously published as Bell Telephone Laboratories Paper + * </p> + * + * @author Arthur Zimek + * + * @apiviz.has MeanModel + * + * @param <V> vector datatype + * @param <D> distance value type + */ +@Title("K-Means") +@Description("Finds a partitioning into k clusters.") +@Reference(authors = "S. Lloyd", title = "Least squares quantization in PCM", booktitle = "IEEE Transactions on Information Theory 28 (2): 129–137.", url = "http://dx.doi.org/10.1109/TIT.1982.1056489") +public class KMeansLloyd<V extends NumberVector<V, ?>, D extends Distance<D>> extends AbstractKMeans<V, D> implements ClusteringAlgorithm<Clustering<MeanModel<V>>> { + /** + * The logger for this class. + */ + private static final Logging logger = Logging.getLogger(KMeansLloyd.class); + + /** + * Constructor. + * + * @param distanceFunction distance function + * @param k k parameter + * @param maxiter Maxiter parameter + */ + public KMeansLloyd(PrimitiveDistanceFunction<NumberVector<?, ?>, D> distanceFunction, int k, int maxiter, KMeansInitialization<V> initializer) { + super(distanceFunction, k, maxiter, initializer); + } + + /** + * Run k-means + * + * @param database Database + * @param relation relation to use + * @return result + * @throws IllegalStateException + */ + public Clustering<MeanModel<V>> run(Database database, Relation<V> relation) throws IllegalStateException { + if(relation.size() <= 0) { + return new Clustering<MeanModel<V>>("k-Means Clustering", "kmeans-clustering"); + } + // Choose initial means + List<Vector> means = initializer.chooseInitialMeans(relation, k, getDistanceFunction()); + // Setup cluster assignment store + List<ModifiableDBIDs> clusters = new ArrayList<ModifiableDBIDs>(); + for(int i = 0; i < k; i++) { + clusters.add(DBIDUtil.newHashSet(relation.size() / k)); + } + + for(int iteration = 0; maxiter <= 0 || iteration < maxiter; iteration++) { + 
if(logger.isVerbose()) { + logger.verbose("K-Means iteration " + (iteration + 1)); + } + boolean changed = assignToNearestCluster(relation, means, clusters); + // Stop if no cluster assignment changed. + if(!changed) { + break; + } + // Recompute means. + means = means(clusters, means, relation); + } + // Wrap result + final V factory = DatabaseUtil.assumeVectorField(relation).getFactory(); + Clustering<MeanModel<V>> result = new Clustering<MeanModel<V>>("k-Means Clustering", "kmeans-clustering"); + for(int i = 0; i < clusters.size(); i++) { + MeanModel<V> model = new MeanModel<V>(factory.newNumberVector(means.get(i).getArrayRef())); + result.addCluster(new Cluster<MeanModel<V>>(clusters.get(i), model)); + } + return result; + } + + @Override + protected Logging getLogger() { + return logger; + } + + /** + * Parameterization class. + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer<V extends NumberVector<V, ?>, D extends Distance<D>> extends AbstractPrimitiveDistanceBasedAlgorithm.Parameterizer<NumberVector<?, ?>, D> { + protected int k; + + protected int maxiter; + + protected KMeansInitialization<V> initializer; + + @Override + protected void makeOptions(Parameterization config) { + super.makeOptions(config); + IntParameter kP = new IntParameter(K_ID, new GreaterConstraint(0)); + if(config.grab(kP)) { + k = kP.getValue(); + } + + ObjectParameter<KMeansInitialization<V>> initialP = new ObjectParameter<KMeansInitialization<V>>(INIT_ID, KMeansInitialization.class, RandomlyGeneratedInitialMeans.class); + if(config.grab(initialP)) { + initializer = initialP.instantiateClass(config); + } + + IntParameter maxiterP = new IntParameter(MAXITER_ID, new GreaterEqualConstraint(0), 0); + if(config.grab(maxiterP)) { + maxiter = maxiterP.getValue(); + } + } + + @Override + protected AbstractKMeans<V, D> makeInstance() { + return new KMeansLloyd<V, D>(distanceFunction, k, maxiter, initializer); + } + } +}
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansMacQueen.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansMacQueen.java new file mode 100644 index 00000000..56492dd0 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansMacQueen.java @@ -0,0 +1,177 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2012 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +import java.util.ArrayList; +import java.util.List; + +import de.lmu.ifi.dbs.elki.algorithm.AbstractPrimitiveDistanceBasedAlgorithm; +import de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm; +import de.lmu.ifi.dbs.elki.data.Cluster; +import de.lmu.ifi.dbs.elki.data.Clustering; +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.data.model.MeanModel; +import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.ids.DBIDs; +import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance; +import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; +import de.lmu.ifi.dbs.elki.utilities.DatabaseUtil; +import de.lmu.ifi.dbs.elki.utilities.documentation.Description; +import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; +import de.lmu.ifi.dbs.elki.utilities.documentation.Title; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; + +/** + * Provides the k-means algorithm, using MacQueen style incremental updates. + * + * <p> + * Reference:<br /> + * J. MacQueen: Some Methods for Classification and Analysis of Multivariate + * Observations. <br /> + * In 5th Berkeley Symp. Math. Statist. Prob., Vol. 1, 1967, pp 281-297. 
+ * </p> + * + * @author Erich Schubert + * + * @apiviz.has MeanModel + * + * @param <V> vector type to use + * @param <D> distance function value type + */ +@Title("K-Means") +@Description("Finds a partitioning into k clusters.") +@Reference(authors = "J. MacQueen", title = "Some Methods for Classification and Analysis of Multivariate Observations", booktitle = "5th Berkeley Symp. Math. Statist. Prob., Vol. 1, 1967, pp 281-297", url = "http://projecteuclid.org/euclid.bsmsp/1200512992") +public class KMeansMacQueen<V extends NumberVector<V, ?>, D extends Distance<D>> extends AbstractKMeans<V, D> implements ClusteringAlgorithm<Clustering<MeanModel<V>>> { + /** + * The logger for this class. + */ + private static final Logging logger = Logging.getLogger(KMeansMacQueen.class); + + /** + * Constructor. + * + * @param distanceFunction distance function + * @param k k parameter + * @param maxiter Maxiter parameter + */ + public KMeansMacQueen(PrimitiveDistanceFunction<NumberVector<?, ?>, D> distanceFunction, int k, int maxiter, KMeansInitialization<V> initializer) { + super(distanceFunction, k, maxiter, initializer); + } + + /** + * Run k-means + * + * @param database Database + * @param relation relation to use + * @return result + * @throws IllegalStateException + */ + public Clustering<MeanModel<V>> run(Database database, Relation<V> relation) throws IllegalStateException { + if(relation.size() <= 0) { + return new Clustering<MeanModel<V>>("k-Means Clustering", "kmeans-clustering"); + } + // Choose initial means + List<Vector> means = initializer.chooseInitialMeans(relation, k, getDistanceFunction()); + // Initialize cluster and assign objects + List<ModifiableDBIDs> clusters = new ArrayList<ModifiableDBIDs>(); + for(int i = 0; i < k; i++) { + clusters.add(DBIDUtil.newHashSet(relation.size() / k)); + } + assignToNearestCluster(relation, means, clusters); + // Initial recomputation of the means. 
+ means = means(clusters, means, relation); + + // Refine result + for(int iteration = 0; maxiter <= 0 || iteration < maxiter; iteration++) { + if(logger.isVerbose()) { + logger.verbose("K-Means iteration " + (iteration + 1)); + } + boolean changed = macQueenIterate(relation, means, clusters); + if(!changed) { + break; + } + } + final V factory = DatabaseUtil.assumeVectorField(relation).getFactory(); + Clustering<MeanModel<V>> result = new Clustering<MeanModel<V>>("k-Means Clustering", "kmeans-clustering"); + for(int i = 0; i < clusters.size(); i++) { + DBIDs ids = clusters.get(i); + MeanModel<V> model = new MeanModel<V>(factory.newNumberVector(means.get(i).getArrayRef())); + result.addCluster(new Cluster<MeanModel<V>>(ids, model)); + } + return result; + } + + @Override + protected Logging getLogger() { + return logger; + } + + /** + * Parameterization class. + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer<V extends NumberVector<V, ?>, D extends Distance<D>> extends AbstractPrimitiveDistanceBasedAlgorithm.Parameterizer<NumberVector<?, ?>, D> { + protected int k; + + protected int maxiter; + + protected KMeansInitialization<V> initializer; + + @Override + protected void makeOptions(Parameterization config) { + super.makeOptions(config); + IntParameter kP = new IntParameter(K_ID, new GreaterConstraint(0)); + if(config.grab(kP)) { + k = kP.getValue(); + } + + ObjectParameter<KMeansInitialization<V>> initialP = new ObjectParameter<KMeansInitialization<V>>(INIT_ID, KMeansInitialization.class, RandomlyGeneratedInitialMeans.class); + if(config.grab(initialP)) { + initializer = initialP.instantiateClass(config); + } + + IntParameter maxiterP = new IntParameter(MAXITER_ID, new GreaterEqualConstraint(0), 0); + if(config.grab(maxiterP)) { + maxiter = maxiterP.getValue(); + } + } + + @Override + protected AbstractKMeans<V, D> makeInstance() { + return new KMeansMacQueen<V, D>(distanceFunction, k, maxiter, initializer); + } + } +}
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansPlusPlusInitialMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansPlusPlusInitialMeans.java new file mode 100644 index 00000000..c7a2fa1d --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansPlusPlusInitialMeans.java @@ -0,0 +1,213 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2012 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ +import java.util.ArrayList; +import java.util.List; +import java.util.Random; + +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs; +import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; +import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; +import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDoubleDistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance; +import de.lmu.ifi.dbs.elki.logging.LoggingUtil; +import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; +import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; +import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException; + +/** + * K-Means++ initialization for k-means. + * + * Reference: + * <p> + * D. Arthur, S. Vassilvitskii<br /> + * k-means++: the advantages of careful seeding<br /> + * In: Proc. of the Eighteenth Annual ACM-SIAM Symposium on Discrete Algorithms, + * SODA 2007 + * </p> + * + * @author Erich Schubert + * + * @param <V> Vector type + * @param <D> Distance type + */ +@Reference(authors = "D. Arthur, S. Vassilvitskii", title = "k-means++: the advantages of careful seeding", booktitle = "Proc. of the Eighteenth Annual ACM-SIAM Symposium on Discrete Algorithms, SODA 2007", url = "http://dx.doi.org/10.1145/1283383.1283494") +public class KMeansPlusPlusInitialMeans<V extends NumberVector<V, ?>, D extends NumberDistance<D, ?>> extends AbstractKMeansInitialization<V> { + /** + * Constructor. + * + * @param seed Random seed. + */ + public KMeansPlusPlusInitialMeans(Long seed) { + super(seed); + } + + @Override + public List<Vector> chooseInitialMeans(Relation<V> relation, int k, PrimitiveDistanceFunction<? 
super V, ?> distanceFunction) { + // Get a distance query + if(!(distanceFunction.getDistanceFactory() instanceof NumberDistance)) { + throw new AbortException("K-Means++ initialization can only be used with numerical distances."); + } + @SuppressWarnings("unchecked") + final PrimitiveDistanceFunction<? super V, D> distF = (PrimitiveDistanceFunction<? super V, D>) distanceFunction; + DistanceQuery<V, D> distQ = relation.getDatabase().getDistanceQuery(relation, distF); + + // Chose first mean + List<Vector> means = new ArrayList<Vector>(k); + + Random random = (seed != null) ? new Random(seed) : new Random(); + DBID first = DBIDUtil.randomSample(relation.getDBIDs(), 1, random.nextLong()).iterator().next(); + means.add(relation.get(first).getColumnVector()); + + ModifiableDBIDs chosen = DBIDUtil.newHashSet(k); + chosen.add(first); + ArrayDBIDs ids = DBIDUtil.ensureArray(relation.getDBIDs()); + // Initialize weights + double[] weights = new double[ids.size()]; + double weightsum = initialWeights(weights, ids, first, distQ); + while(means.size() < k) { + if(weightsum > Double.MAX_VALUE) { + LoggingUtil.warning("Could not choose a reasonable mean for k-means++ - too many data points, too large squared distances?"); + } + if(weightsum < Double.MIN_NORMAL) { + LoggingUtil.warning("Could not choose a reasonable mean for k-means++ - to few data points?"); + } + double r = random.nextDouble() * weightsum; + int pos = 0; + while(r > 0 && pos < weights.length) { + r -= weights[pos]; + pos++; + } + // Add new mean: + DBID newmean = ids.get(pos); + means.add(relation.get(newmean).getColumnVector()); + chosen.add(newmean); + // Update weights: + weights[pos] = 0.0; + // Choose optimized version for double distances, if applicable. 
+ if (distF instanceof PrimitiveDoubleDistanceFunction) { + @SuppressWarnings("unchecked") + PrimitiveDoubleDistanceFunction<V> ddist = (PrimitiveDoubleDistanceFunction<V>) distF; + weightsum = updateWeights(weights, ids, newmean, ddist, relation); + } else { + weightsum = updateWeights(weights, ids, newmean, distQ); + } + } + + return means; + } + + /** + * Initialize the weight list. + * + * @param weights Weight list + * @param ids IDs + * @param latest Added ID + * @param distQ Distance query + * @return Weight sum + */ + protected double initialWeights(double[] weights, ArrayDBIDs ids, DBID latest, DistanceQuery<V, D> distQ) { + double weightsum = 0.0; + DBIDIter it = ids.iter(); + for(int i = 0; i < weights.length; i++, it.advance()) { + DBID id = it.getDBID(); + if(latest.equals(id)) { + weights[i] = 0.0; + } + else { + double d = distQ.distance(latest, id).doubleValue(); + weights[i] = d * d; + } + weightsum += weights[i]; + } + return weightsum; + } + + /** + * Update the weight list. + * + * @param weights Weight list + * @param ids IDs + * @param latest Added ID + * @param distQ Distance query + * @return Weight sum + */ + protected double updateWeights(double[] weights, ArrayDBIDs ids, DBID latest, DistanceQuery<V, D> distQ) { + double weightsum = 0.0; + DBIDIter it = ids.iter(); + for(int i = 0; i < weights.length; i++, it.advance()) { + DBID id = it.getDBID(); + if(weights[i] > 0.0) { + double d = distQ.distance(latest, id).doubleValue(); + weights[i] = Math.min(weights[i], d * d); + weightsum += weights[i]; + } + } + return weightsum; + } + + /** + * Update the weight list. 
+ * + * @param weights Weight list + * @param ids IDs + * @param latest Added ID + * @param distF Distance function + * @return Weight sum + */ + protected double updateWeights(double[] weights, ArrayDBIDs ids, DBID latest, PrimitiveDoubleDistanceFunction<V> distF, Relation<V> rel) { + final V lv = rel.get(latest); + double weightsum = 0.0; + DBIDIter it = ids.iter(); + for(int i = 0; i < weights.length; i++, it.advance()) { + DBID id = it.getDBID(); + if(weights[i] > 0.0) { + double d = distF.doubleDistance(lv, rel.get(id)); + weights[i] = Math.min(weights[i], d * d); + weightsum += weights[i]; + } + } + return weightsum; + } + + /** + * Parameterization class. + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer<V extends NumberVector<V, ?>, D extends NumberDistance<D, ?>> extends AbstractKMeansInitialization.Parameterizer<V> { + @Override + protected KMeansPlusPlusInitialMeans<V, D> makeInstance() { + return new KMeansPlusPlusInitialMeans<V, D>(seed); + } + } +}
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyChosenInitialMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyChosenInitialMeans.java new file mode 100644 index 00000000..30e59453 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyChosenInitialMeans.java @@ -0,0 +1,78 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2012 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ +import java.util.ArrayList; +import java.util.List; + +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.ids.DBIDs; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; +import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; + +/** + * Initialize K-means by randomly choosing k existing elements as cluster + * centers. 
+ *
+ * @author Erich Schubert
+ *
+ * @param <V> Vector type
+ */
+public class RandomlyChosenInitialMeans<V extends NumberVector<V, ?>> extends AbstractKMeansInitialization<V> {
+  /**
+   * Constructor.
+   *
+   * @param seed Random seed, passed on to the superclass.
+   */
+  public RandomlyChosenInitialMeans(Long seed) {
+    super(seed);
+  }
+
+  /**
+   * Draw a random sample of k objects and use their column vectors as means.
+   * The distance function is not used by this strategy.
+   *
+   * @param relation Relation to sample from
+   * @param k Number of means to choose
+   * @param distanceFunction Distance function (ignored)
+   * @return List with the vectors of the sampled objects
+   */
+  @Override
+  public List<Vector> chooseInitialMeans(Relation<V> relation, int k, PrimitiveDistanceFunction<? super V, ?> distanceFunction) {
+    final DBIDs sample = DBIDUtil.randomSample(relation.getDBIDs(), k, seed);
+    final List<Vector> centers = new ArrayList<Vector>(k);
+    for(DBID picked : sample) {
+      final Vector center = relation.get(picked).getColumnVector();
+      centers.add(center);
+    }
+    return centers;
+  }
+
+  /**
+   * Parameterization class.
+   *
+   * @author Erich Schubert
+   *
+   * @apiviz.exclude
+   */
+  public static class Parameterizer<V extends NumberVector<V, ?>> extends AbstractKMeansInitialization.Parameterizer<V> {
+    @Override
+    protected RandomlyChosenInitialMeans<V> makeInstance() {
+      return new RandomlyChosenInitialMeans<V>(seed);
+    }
+  }
+}
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyGeneratedInitialMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyGeneratedInitialMeans.java new file mode 100644 index 00000000..e8a466dd --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyGeneratedInitialMeans.java @@ -0,0 +1,87 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2012 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ +import java.util.ArrayList; +import java.util.List; +import java.util.Random; + +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; +import de.lmu.ifi.dbs.elki.math.MathUtil; +import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; +import de.lmu.ifi.dbs.elki.utilities.DatabaseUtil; +import de.lmu.ifi.dbs.elki.utilities.pairs.Pair; + +/** + * Initialize k-means by generating random vectors (within the data set's value + * range). 
+ *
+ * @author Erich Schubert
+ *
+ * @param <V> Vector type
+ */
+public class RandomlyGeneratedInitialMeans<V extends NumberVector<V, ?>> extends AbstractKMeansInitialization<V> {
+  /**
+   * Constructor.
+   *
+   * @param seed Random seed, passed on to the superclass.
+   */
+  public RandomlyGeneratedInitialMeans(Long seed) {
+    super(seed);
+  }
+
+  /**
+   * Generate k random vectors, each coordinate rescaled into the
+   * [minimum, maximum] range the relation has in that dimension. The
+   * distance function is not used by this strategy.
+   *
+   * @param relation Relation that defines dimensionality and value range
+   * @param k Number of means to generate
+   * @param distanceFunction Distance function (ignored)
+   * @return List of k generated vectors
+   */
+  @Override
+  public List<Vector> chooseInitialMeans(Relation<V> relation, int k, PrimitiveDistanceFunction<? super V, ?> distanceFunction) {
+    final int dim = DatabaseUtil.dimensionality(relation);
+    final Pair<V, V> minmax = DatabaseUtil.computeMinMax(relation);
+    // Per-dimension lower bound and extent; vector dimensions are accessed
+    // with a 1-based index.
+    final double[] lower = new double[dim];
+    final double[] extent = new double[dim];
+    for(int d = 0; d < dim; d++) {
+      lower[d] = minmax.first.doubleValue(d + 1);
+      extent[d] = minmax.second.doubleValue(d + 1) - lower[d];
+    }
+
+    final Random random;
+    if(this.seed != null) {
+      random = new Random(this.seed);
+    }
+    else {
+      random = new Random();
+    }
+
+    final List<Vector> means = new ArrayList<Vector>(k);
+    for(int generated = 0; generated < k; generated++) {
+      final double[] coords = MathUtil.randomDoubleArray(dim, random);
+      // Rescale
+      for(int d = 0; d < dim; d++) {
+        coords[d] = lower[d] + extent[d] * coords[d];
+      }
+      means.add(new Vector(coords));
+    }
+    return means;
+  }
+
+  /**
+   * Parameterization class.
+   *
+   * @author Erich Schubert
+   *
+   * @apiviz.exclude
+   */
+  public static class Parameterizer<V extends NumberVector<V, ?>> extends AbstractKMeansInitialization.Parameterizer<V> {
+    @Override
+    protected RandomlyGeneratedInitialMeans<V> makeInstance() {
+      return new RandomlyGeneratedInitialMeans<V>(seed);
+    }
+  }
+}
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/package-info.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/package-info.java new file mode 100644 index 00000000..2ce625b0 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/package-info.java @@ -0,0 +1,26 @@ +/** + * <p>K-means clustering and variations.</p> + */ +/* +This file is part of ELKI: +Environment for Developing KDD-Applications Supported by Index-Structures + +Copyright (C) 2012 +Ludwig-Maximilians-Universität München +Lehr- und Forschungseinheit für Datenbanksysteme +ELKI Development Team + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU Affero General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU Affero General Public License for more details. + +You should have received a copy of the GNU Affero General Public License +along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ +package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/package-info.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/package-info.java index 660a7a4f..eed031df 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/package-info.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/package-info.java @@ -1,5 +1,5 @@ /** - * <p>Clustering algorithms</p> + * <p>Clustering algorithms.</p> * * Clustering algorithms are supposed to implement the {@link de.lmu.ifi.dbs.elki.algorithm.Algorithm}-Interface. * The more specialized interface {@link de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm} @@ -15,7 +15,7 @@ This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures -Copyright (C) 2011 +Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/CLIQUE.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/CLIQUE.java index dfc4e1cd..e3b274a6 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/CLIQUE.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/CLIQUE.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.subspace; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/DiSH.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/DiSH.java index 987c7eda..c4c1687b 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/DiSH.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/DiSH.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.subspace; This file is part of ELKI: Environment for 
Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/HiSC.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/HiSC.java index 36473cc0..40ab60a8 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/HiSC.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/HiSC.java @@ -3,7 +3,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.subspace; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures -Copyright (C) 2011 +Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/PROCLUS.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/PROCLUS.java index 92c2248c..3f16e907 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/PROCLUS.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/PROCLUS.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.subspace; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/PreDeCon.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/PreDeCon.java index 22e9c150..4ca5a564 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/PreDeCon.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/PreDeCon.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.subspace; This file is part of ELKI: Environment for Developing KDD-Applications Supported by 
Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/SUBCLU.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/SUBCLU.java index c0edb2ea..963c0922 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/SUBCLU.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/SUBCLU.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.subspace; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -44,7 +44,7 @@ import de.lmu.ifi.dbs.elki.database.ProxyDatabase; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.distance.distancefunction.subspace.AbstractDimensionsSelectingDoubleDistanceFunction; -import de.lmu.ifi.dbs.elki.distance.distancefunction.subspace.DimensionsSelectingEuclideanDistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancefunction.subspace.SubspaceEuclideanDistanceFunction; import de.lmu.ifi.dbs.elki.distance.distancevalue.DoubleDistance; import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.logging.progress.StepProgress; @@ -94,7 +94,7 @@ public class SUBCLU<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clus /** * The distance function to determine the distance between database objects. 
* <p> - * Default value: {@link DimensionsSelectingEuclideanDistanceFunction} + * Default value: {@link SubspaceEuclideanDistanceFunction} * </p> * <p> * Key: {@code -subclu.distancefunction} @@ -477,7 +477,7 @@ public class SUBCLU<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clus @Override protected void makeOptions(Parameterization config) { super.makeOptions(config); - ObjectParameter<AbstractDimensionsSelectingDoubleDistanceFunction<V>> param = new ObjectParameter<AbstractDimensionsSelectingDoubleDistanceFunction<V>>(DISTANCE_FUNCTION_ID, AbstractDimensionsSelectingDoubleDistanceFunction.class, DimensionsSelectingEuclideanDistanceFunction.class); + ObjectParameter<AbstractDimensionsSelectingDoubleDistanceFunction<V>> param = new ObjectParameter<AbstractDimensionsSelectingDoubleDistanceFunction<V>>(DISTANCE_FUNCTION_ID, AbstractDimensionsSelectingDoubleDistanceFunction.class, SubspaceEuclideanDistanceFunction.class); if(config.grab(param)) { distance = param.instantiateClass(config); } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/clique/CLIQUESubspace.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/clique/CLIQUESubspace.java index 1874c9e8..eff71a35 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/clique/CLIQUESubspace.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/clique/CLIQUESubspace.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.subspace.clique; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/clique/CLIQUEUnit.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/clique/CLIQUEUnit.java index 4b6fa9ad..db687567 100644 --- 
a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/clique/CLIQUEUnit.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/clique/CLIQUEUnit.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.subspace.clique; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -34,6 +34,7 @@ import de.lmu.ifi.dbs.elki.data.NumberVector; import de.lmu.ifi.dbs.elki.database.ids.DBID; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; +import de.lmu.ifi.dbs.elki.database.ids.HashSetModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; /** @@ -265,7 +266,7 @@ public class CLIQUEUnit<V extends NumberVector<V, ?>> { resultIntervals.add(this.intervals.last()); resultIntervals.add(other.intervals.last()); - ModifiableDBIDs resultIDs = DBIDUtil.newHashSet(this.ids); + HashSetModifiableDBIDs resultIDs = DBIDUtil.newHashSet(this.ids); resultIDs.retainAll(other.ids); if(resultIDs.size() / all >= tau) { diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/clique/package-info.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/clique/package-info.java index 444cb0e6..7a686190 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/clique/package-info.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/clique/package-info.java @@ -7,7 +7,7 @@ This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures -Copyright (C) 2011 +Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/package-info.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/package-info.java index 
168ceadb..2a1eb930 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/package-info.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/package-info.java @@ -10,7 +10,7 @@ This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures -Copyright (C) 2011 +Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/ByLabelClustering.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/ByLabelClustering.java index 02350db3..43c6a218 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/ByLabelClustering.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/ByLabelClustering.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.trivial; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -85,9 +85,7 @@ public class ByLabelClustering extends AbstractAlgorithm<Clustering<Model>> impl public static final OptionID MULTIPLE_ID = OptionID.getOrCreateOptionID("bylabelclustering.multiple", "Flag to indicate that only subspaces with large coverage " + "(i.e. the fraction of the database that is covered by the dense units) " + "are selected, the rest will be pruned."); /** - * Flag to indicate that multiple cluster assignment is possible. If an - * assignment to multiple clusters is desired, the labels indicating the - * clusters need to be separated by blanks. + * Pattern to recognize noise clusters by. 
*/ public static final OptionID NOISE_ID = OptionID.getOrCreateOptionID("bylabelclustering.noise", "Pattern to recognize noise classes by their label."); @@ -144,7 +142,7 @@ public class ByLabelClustering extends AbstractAlgorithm<Clustering<Model>> impl ModifiableDBIDs noiseids = DBIDUtil.newArray(); Clustering<Model> result = new Clustering<Model>("By Label Clustering", "bylabel-clustering"); for(Entry<String, ModifiableDBIDs> entry : labelMap.entrySet()) { - ModifiableDBIDs ids = labelMap.get(entry.getKey()); + ModifiableDBIDs ids = entry.getValue(); if(ids.size() <= 1) { noiseids.addDBIDs(ids); continue; diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/ByLabelHierarchicalClustering.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/ByLabelHierarchicalClustering.java index 5b8041d7..228cc7e7 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/ByLabelHierarchicalClustering.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/ByLabelHierarchicalClustering.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.trivial; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/ByModelClustering.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/ByModelClustering.java new file mode 100644 index 00000000..cd45cda2 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/ByModelClustering.java @@ -0,0 +1,163 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.trivial; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2012 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This 
program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +import java.util.HashMap; +import java.util.Map.Entry; +import java.util.regex.Pattern; + +import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm; +import de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm; +import de.lmu.ifi.dbs.elki.data.Cluster; +import de.lmu.ifi.dbs.elki.data.Clustering; +import de.lmu.ifi.dbs.elki.data.model.Model; +import de.lmu.ifi.dbs.elki.data.synthetic.bymodel.GeneratorInterface; +import de.lmu.ifi.dbs.elki.data.type.TypeInformation; +import de.lmu.ifi.dbs.elki.data.type.TypeUtil; +import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.utilities.documentation.Description; +import de.lmu.ifi.dbs.elki.utilities.documentation.Title; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.PatternParameter; + +/** + * Pseudo clustering using annotated models. 
+ * + * This "algorithm" puts elements into the same cluster when they agree in their + * model. I.e. it just uses a predefined clustering, and is mostly useful for + * testing and evaluation (e.g. comparing the result of a real algorithm to the + * reference result / golden standard used by the generator). + * + * @author Erich Schubert + * + * @apiviz.uses Model + */ +@Title("Clustering by model") +@Description("Cluster points by a (pre-assigned!) model. For comparing results with a reference clustering.") +public class ByModelClustering extends AbstractAlgorithm<Clustering<Model>> implements ClusteringAlgorithm<Clustering<Model>> { + /** + * The logger for this class. + */ + private static final Logging logger = Logging.getLogger(ByModelClustering.class); + + /** + * Pattern to recognize noise clusters with + */ + public static final OptionID NOISE_ID = OptionID.getOrCreateOptionID("bymodel.noise", "Pattern to recognize noise models by their label."); + + /** + * Holds the value of {@link #NOISE_ID}. + */ + private Pattern noisepattern = null; + + /** + * Constructor. + * + * @param noisepattern Noise pattern + */ + public ByModelClustering(Pattern noisepattern) { + super(); + this.noisepattern = noisepattern; + } + + /** + * Constructor without parameters + */ + public ByModelClustering() { + this(null); + } + + /** + * Run the actual clustering algorithm. 
+ * + * @param relation The data input we use + */ + public Clustering<Model> run(Relation<Model> relation) { + // Build model mapping + HashMap<Model, ModifiableDBIDs> modelMap = new HashMap<Model, ModifiableDBIDs>(); + for(DBID id : relation.iterDBIDs()) { + Model model = relation.get(id); + ModifiableDBIDs modelids = modelMap.get(model); + if(modelids == null) { + modelids = DBIDUtil.newHashSet(); + modelMap.put(model, modelids); + } + modelids.add(id); + } + + Clustering<Model> result = new Clustering<Model>("By Model Clustering", "bymodel-clustering"); + for(Entry<Model, ModifiableDBIDs> entry : modelMap.entrySet()) { + final Model model = entry.getKey(); + final ModifiableDBIDs ids = entry.getValue(); + final String name = (model instanceof GeneratorInterface) ? ((GeneratorInterface) model).getName() : model.toString(); + Cluster<Model> c = new Cluster<Model>(name, ids, model); + if(noisepattern != null && noisepattern.matcher(name).find()) { + c.setNoise(true); + } + result.addCluster(c); + } + return result; + } + + @Override + public TypeInformation[] getInputTypeRestriction() { + return TypeUtil.array(TypeUtil.MODEL); + } + + @Override + protected Logging getLogger() { + return logger; + } + + /** + * Parameterization class. + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer extends AbstractParameterizer { + protected Pattern noisepat; + + @Override + protected void makeOptions(Parameterization config) { + super.makeOptions(config); + PatternParameter noisepatP = new PatternParameter(NOISE_ID, true); + if(config.grab(noisepatP)) { + noisepat = noisepatP.getValue(); + } + } + + @Override + protected ByModelClustering makeInstance() { + return new ByModelClustering(noisepat); + } + } +}
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/TrivialAllInOne.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/TrivialAllInOne.java index a316ce57..2e7d006d 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/TrivialAllInOne.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/TrivialAllInOne.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.trivial; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/TrivialAllNoise.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/TrivialAllNoise.java index b85f5445..c497632c 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/TrivialAllNoise.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/TrivialAllNoise.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.trivial; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/package-info.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/package-info.java index 5629855c..5870a736 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/package-info.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/package-info.java @@ -7,7 +7,7 @@ This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures -Copyright (C) 2011 +Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development 
Team diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/ABOD.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/ABOD.java index 50365996..f0b31d32 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/ABOD.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/ABOD.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.outlier; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -26,9 +26,6 @@ package de.lmu.ifi.dbs.elki.algorithm.outlier; import java.util.Collections; import java.util.HashMap; import java.util.Iterator; -import java.util.LinkedList; -import java.util.List; -import java.util.PriorityQueue; import de.lmu.ifi.dbs.elki.algorithm.AbstractDistanceBasedAlgorithm; import de.lmu.ifi.dbs.elki.data.NumberVector; @@ -38,7 +35,7 @@ import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.QueryUtil; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; -import de.lmu.ifi.dbs.elki.database.datastore.WritableDataStore; +import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore; import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.ids.DBID; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; @@ -46,6 +43,8 @@ import de.lmu.ifi.dbs.elki.database.ids.DBIDs; import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.query.DistanceResultPair; import de.lmu.ifi.dbs.elki.database.query.knn.KNNQuery; +import de.lmu.ifi.dbs.elki.database.query.knn.KNNResult; +import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction; import 
de.lmu.ifi.dbs.elki.distance.distancevalue.DoubleDistance; @@ -55,10 +54,11 @@ import de.lmu.ifi.dbs.elki.distance.similarityfunction.kernel.PolynomialKernelFu import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.math.DoubleMinMax; import de.lmu.ifi.dbs.elki.math.MeanVariance; -import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation; +import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; import de.lmu.ifi.dbs.elki.result.outlier.InvertedOutlierScoreMeta; import de.lmu.ifi.dbs.elki.result.outlier.OutlierResult; import de.lmu.ifi.dbs.elki.result.outlier.OutlierScoreMeta; +import de.lmu.ifi.dbs.elki.utilities.datastructures.heap.Heap; import de.lmu.ifi.dbs.elki.utilities.documentation.Description; import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import de.lmu.ifi.dbs.elki.utilities.documentation.Title; @@ -68,7 +68,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualCons import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; -import de.lmu.ifi.dbs.elki.utilities.pairs.FCPair; +import de.lmu.ifi.dbs.elki.utilities.pairs.DoubleObjPair; /** * Angle-Based Outlier Detection @@ -177,10 +177,10 @@ public class ABOD<V extends NumberVector<V, ?>> extends AbstractDistanceBasedAlg public OutlierResult getRanking(Relation<V> relation, int k) { // Fix a static set of IDs staticids = DBIDUtil.newArray(relation.getDBIDs()); - Collections.sort(staticids); + staticids.sort(); KernelMatrix kernelMatrix = new KernelMatrix(primitiveKernelFunction, relation, staticids); - PriorityQueue<FCPair<Double, DBID>> pq = new PriorityQueue<FCPair<Double, DBID>>(relation.size(), Collections.reverseOrder()); + Heap<DoubleObjPair<DBID>> pq = new Heap<DoubleObjPair<DBID>>(relation.size(), Collections.reverseOrder()); // preprocess kNN neighborhoods 
assert (k == this.k); @@ -190,7 +190,7 @@ public class ABOD<V extends NumberVector<V, ?>> extends AbstractDistanceBasedAlg MeanVariance s = new MeanVariance(); // System.out.println("Processing: " +objKey); - List<DistanceResultPair<DoubleDistance>> neighbors = knnQuery.getKNNForDBID(objKey, k); + KNNResult<DoubleDistance> neighbors = knnQuery.getKNNForDBID(objKey, k); Iterator<DistanceResultPair<DoubleDistance>> iter = neighbors.iterator(); while(iter.hasNext()) { DBID key1 = iter.next().getDBID(); @@ -214,14 +214,14 @@ public class ABOD<V extends NumberVector<V, ?>> extends AbstractDistanceBasedAlg } // Sample variance probably would be correct, however the numerical // instabilities can actually break ABOD here. - pq.add(new FCPair<Double, DBID>(s.getNaiveVariance(), objKey)); + pq.add(new DoubleObjPair<DBID>(s.getNaiveVariance(), objKey)); } DoubleMinMax minmaxabod = new DoubleMinMax(); - WritableDataStore<Double> abodvalues = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC, Double.class); - for(FCPair<Double, DBID> pair : pq) { - abodvalues.put(pair.getSecond(), pair.getFirst()); - minmaxabod.put(pair.getFirst()); + WritableDoubleDataStore abodvalues = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC); + for(DoubleObjPair<DBID> pair : pq) { + abodvalues.putDouble(pair.getSecond(), pair.first); + minmaxabod.put(pair.first); } // Build result representation. 
Relation<Double> scoreResult = new MaterializedRelation<Double>("Angle-based Outlier Degree", "abod-outlier", TypeUtil.DOUBLE, abodvalues, relation.getDBIDs()); @@ -240,16 +240,16 @@ public class ABOD<V extends NumberVector<V, ?>> extends AbstractDistanceBasedAlg public OutlierResult getFastRanking(Relation<V> relation, int k, int sampleSize) { // Fix a static set of IDs staticids = DBIDUtil.newArray(relation.getDBIDs()); - Collections.sort(staticids); + staticids.sort(); KernelMatrix kernelMatrix = new KernelMatrix(primitiveKernelFunction, relation, staticids); - PriorityQueue<FCPair<Double, DBID>> pq = new PriorityQueue<FCPair<Double, DBID>>(relation.size(), Collections.reverseOrder()); + Heap<DoubleObjPair<DBID>> pq = new Heap<DoubleObjPair<DBID>>(relation.size(), Collections.reverseOrder()); // get Candidate Ranking for(DBID aKey : relation.iterDBIDs()) { HashMap<DBID, Double> dists = new HashMap<DBID, Double>(relation.size()); // determine kNearestNeighbors and pairwise distances - PriorityQueue<FCPair<Double, DBID>> nn; + Heap<DoubleObjPair<DBID>> nn; if(!useRNDSample) { nn = calcDistsandNN(relation, kernelMatrix, sampleSize, aKey, dists); } @@ -269,15 +269,15 @@ public class ABOD<V extends NumberVector<V, ?>> extends AbstractDistanceBasedAlg } // getFilter double var = getAbofFilter(kernelMatrix, aKey, dists, counter[1], counter[0], neighbors); - pq.add(new FCPair<Double, DBID>(var, aKey)); + pq.add(new DoubleObjPair<DBID>(var, aKey)); // System.out.println("prog "+(prog++)); } // refine Candidates - PriorityQueue<FCPair<Double, DBID>> resqueue = new PriorityQueue<FCPair<Double, DBID>>(k); + Heap<DoubleObjPair<DBID>> resqueue = new Heap<DoubleObjPair<DBID>>(k); // System.out.println(pq.size() + " objects ordered into candidate list."); // int v = 0; while(!pq.isEmpty()) { - if(resqueue.size() == k && pq.peek().getFirst() > resqueue.peek().getFirst()) { + if(resqueue.size() == k && pq.peek().first > resqueue.peek().first) { break; } // double approx = 
pq.peek().getFirst(); @@ -313,22 +313,22 @@ public class ABOD<V extends NumberVector<V, ?>> extends AbstractDistanceBasedAlg double var = s.getSampleVariance(); // System.out.println(aKey+ " : " + approx +" " + var); if(resqueue.size() < k) { - resqueue.add(new FCPair<Double, DBID>(var, aKey)); + resqueue.add(new DoubleObjPair<DBID>(var, aKey)); } else { - if(resqueue.peek().getFirst() > var) { + if(resqueue.peek().first > var) { resqueue.remove(); - resqueue.add(new FCPair<Double, DBID>(var, aKey)); + resqueue.add(new DoubleObjPair<DBID>(var, aKey)); } } } // System.out.println(v + " Punkte von " + data.size() + " verfeinert !!"); DoubleMinMax minmaxabod = new DoubleMinMax(); - WritableDataStore<Double> abodvalues = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC, Double.class); - for(FCPair<Double, DBID> pair : pq) { - abodvalues.put(pair.getSecond(), pair.getFirst()); - minmaxabod.put(pair.getFirst()); + WritableDoubleDataStore abodvalues = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC); + for(DoubleObjPair<DBID> pair : pq) { + abodvalues.putDouble(pair.getSecond(), pair.first); + minmaxabod.put(pair.first); } // Build result representation. Relation<Double> scoreResult = new MaterializedRelation<Double>("Angle-based Outlier Detection", "abod-outlier", TypeUtil.DOUBLE, abodvalues, relation.getDBIDs()); @@ -336,31 +336,6 @@ public class ABOD<V extends NumberVector<V, ?>> extends AbstractDistanceBasedAlg return new OutlierResult(scoreMeta, scoreResult); } - // TODO: remove? 
- @SuppressWarnings("unused") - private double[] calcNormalization(Integer xKey, HashMap<Integer, Double> dists) { - double[] result = new double[2]; - for(Integer yKey : dists.keySet()) { - if(yKey.equals(xKey)) { - continue; - } - for(Integer zKey : dists.keySet()) { - if(zKey <= yKey) { - continue; - } - if(zKey.equals(xKey)) { - continue; - } - if(dists.get(yKey) != 0 && dists.get(zKey) != 0) { - double sqr = Math.sqrt(dists.get(yKey) * dists.get(zKey)); - result[0] += 1 / sqr; - result[1] += 1 / (dists.get(yKey) * dists.get(zKey) * sqr); - } - } - } - return result; - } - private double[] calcFastNormalization(DBID x, HashMap<DBID, Double> dists) { double[] result = new double[2]; @@ -439,7 +414,7 @@ public class ABOD<V extends NumberVector<V, ?>> extends AbstractDistanceBasedAlg private int mapDBID(DBID aKey) { // TODO: this is not the most efficient... - int off = Collections.binarySearch(staticids, aKey); + int off = staticids.binarySearch(aKey); if(off < 0) { throw new AbortException("Did not find id " + aKey.toString() + " in staticids. 
" + staticids.contains(aKey)); } @@ -457,33 +432,33 @@ public class ABOD<V extends NumberVector<V, ?>> extends AbstractDistanceBasedAlg return (kernelMatrix.getDistance(ai, ai) + kernelMatrix.getDistance(bi, ci) - kernelMatrix.getDistance(ai, ci) - kernelMatrix.getDistance(ai, bi)); } - private PriorityQueue<FCPair<Double, DBID>> calcDistsandNN(Relation<V> data, KernelMatrix kernelMatrix, int sampleSize, DBID aKey, HashMap<DBID, Double> dists) { - PriorityQueue<FCPair<Double, DBID>> nn = new PriorityQueue<FCPair<Double, DBID>>(sampleSize); + private Heap<DoubleObjPair<DBID>> calcDistsandNN(Relation<V> data, KernelMatrix kernelMatrix, int sampleSize, DBID aKey, HashMap<DBID, Double> dists) { + Heap<DoubleObjPair<DBID>> nn = new Heap<DoubleObjPair<DBID>>(sampleSize); for(DBID bKey : data.iterDBIDs()) { double val = calcCos(kernelMatrix, aKey, bKey); dists.put(bKey, val); if(nn.size() < sampleSize) { - nn.add(new FCPair<Double, DBID>(val, bKey)); + nn.add(new DoubleObjPair<DBID>(val, bKey)); } else { - if(val < nn.peek().getFirst()) { + if(val < nn.peek().first) { nn.remove(); - nn.add(new FCPair<Double, DBID>(val, bKey)); + nn.add(new DoubleObjPair<DBID>(val, bKey)); } } } return nn; } - private PriorityQueue<FCPair<Double, DBID>> calcDistsandRNDSample(Relation<V> data, KernelMatrix kernelMatrix, int sampleSize, DBID aKey, HashMap<DBID, Double> dists) { - PriorityQueue<FCPair<Double, DBID>> nn = new PriorityQueue<FCPair<Double, DBID>>(sampleSize); + private Heap<DoubleObjPair<DBID>> calcDistsandRNDSample(Relation<V> data, KernelMatrix kernelMatrix, int sampleSize, DBID aKey, HashMap<DBID, Double> dists) { + Heap<DoubleObjPair<DBID>> nn = new Heap<DoubleObjPair<DBID>>(sampleSize); int step = (int) ((double) data.size() / (double) sampleSize); int counter = 0; for(DBID bKey : data.iterDBIDs()) { double val = calcCos(kernelMatrix, aKey, bKey); dists.put(bKey, val); if(counter % step == 0) { - nn.add(new FCPair<Double, DBID>(val, bKey)); + nn.add(new 
DoubleObjPair<DBID>(val, bKey)); } counter++; } @@ -499,13 +474,13 @@ public class ABOD<V extends NumberVector<V, ?>> extends AbstractDistanceBasedAlg public void getExplanations(Relation<V> data) { KernelMatrix kernelMatrix = new KernelMatrix(primitiveKernelFunction, data, staticids); // PQ for Outlier Ranking - PriorityQueue<FCPair<Double, DBID>> pq = new PriorityQueue<FCPair<Double, DBID>>(data.size(), Collections.reverseOrder()); - HashMap<DBID, LinkedList<DBID>> explaintab = new HashMap<DBID, LinkedList<DBID>>(); + Heap<DoubleObjPair<DBID>> pq = new Heap<DoubleObjPair<DBID>>(data.size(), Collections.reverseOrder()); + HashMap<DBID, DBIDs> explaintab = new HashMap<DBID, DBIDs>(); // test all objects for(DBID objKey : data.iterDBIDs()) { MeanVariance s = new MeanVariance(); // Queue for the best explanation - PriorityQueue<FCPair<Double, DBID>> explain = new PriorityQueue<FCPair<Double, DBID>>(); + Heap<DoubleObjPair<DBID>> explain = new Heap<DoubleObjPair<DBID>>(); // determine Object // for each pair of other objects Iterator<DBID> iter = data.iterDBIDs(); @@ -529,13 +504,13 @@ public class ABOD<V extends NumberVector<V, ?>> extends AbstractDistanceBasedAlg s2.put(tmp, 1 / sqr); } } - explain.add(new FCPair<Double, DBID>(s2.getSampleVariance(), key1)); + explain.add(new DoubleObjPair<DBID>(s2.getSampleVariance(), key1)); s.put(s2); } // build variance of the observed vectors - pq.add(new FCPair<Double, DBID>(s.getSampleVariance(), objKey)); + pq.add(new DoubleObjPair<DBID>(s.getSampleVariance(), objKey)); // - LinkedList<DBID> expList = new LinkedList<DBID>(); + ModifiableDBIDs expList = DBIDUtil.newArray(); expList.add(explain.remove().getSecond()); while(!explain.isEmpty()) { DBID nextKey = explain.remove().getSecond(); @@ -564,26 +539,26 @@ public class ABOD<V extends NumberVector<V, ?>> extends AbstractDistanceBasedAlg if(count > 10) { break; } - double factor = pq.peek().getFirst(); + double factor = pq.peek().first; DBID key = pq.remove().getSecond(); 
System.out.print(data.get(key) + " "); System.out.println(count + " Factor=" + factor + " " + key); - LinkedList<DBID> expList = explaintab.get(key); + DBIDs expList = explaintab.get(key); generateExplanation(data, key, expList); count++; } System.out.println("--------------------------------------------"); } - private void generateExplanation(Relation<V> data, DBID key, LinkedList<DBID> expList) { - V vect1 = data.get(key); + private void generateExplanation(Relation<V> data, DBID key, DBIDs expList) { + Vector vect1 = data.get(key).getColumnVector(); Iterator<DBID> iter = expList.iterator(); while(iter.hasNext()) { System.out.println("Outlier: " + vect1); - V exp = data.get(iter.next()); + Vector exp = data.get(iter.next()).getColumnVector(); System.out.println("Most common neighbor: " + exp); // determine difference Vector - V vals = exp.minus(vect1); + Vector vals = exp.minus(vect1); System.out.println(vals); // System.out.println(new FeatureVector( // "Diff-"+vect1.getPrimaryKey(),vals )); diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/AbstractAggarwalYuOutlier.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/AbstractAggarwalYuOutlier.java index 3be73ca6..994ce8e2 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/AbstractAggarwalYuOutlier.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/AbstractAggarwalYuOutlier.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.outlier; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -31,7 +31,7 @@ import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm; import de.lmu.ifi.dbs.elki.data.NumberVector; import de.lmu.ifi.dbs.elki.data.type.TypeInformation; import de.lmu.ifi.dbs.elki.data.type.TypeUtil; -import de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs; +import 
de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.ids.DBID; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; @@ -54,8 +54,7 @@ import de.lmu.ifi.dbs.elki.utilities.pairs.IntIntPair; * * <p> * Reference: <br /> - * Outlier detection for high dimensional data Outlier detection for high - * dimensional data <br /> + * Outlier detection for high dimensional data<br /> * C.C. Aggarwal, P. S. Yu<br /> * International Conference on Management of Data Proceedings of the 2001 ACM * SIGMOD international conference on Management of data 2001, Santa Barbara, @@ -147,7 +146,7 @@ public abstract class AbstractAggarwalYuOutlier<V extends NumberVector<?, ?>> ex if(r == phi - 1) { end = size; } - ArrayDBIDs currange = DBIDUtil.newArray(phi + 1); + ArrayModifiableDBIDs currange = DBIDUtil.newArray(phi + 1); for(int i = start; i < end; i++) { currange.add(axis.get(i).second); } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/AbstractDBOutlier.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/AbstractDBOutlier.java index 23496389..1d77af3a 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/AbstractDBOutlier.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/AbstractDBOutlier.java @@ -1,37 +1,37 @@ package de.lmu.ifi.dbs.elki.algorithm.outlier;
-/* -This file is part of ELKI: -Environment for Developing KDD-Applications Supported by Index-Structures - -Copyright (C) 2011 -Ludwig-Maximilians-Universität München -Lehr- und Forschungseinheit für Datenbanksysteme -ELKI Development Team - -This program is free software: you can redistribute it and/or modify -it under the terms of the GNU Affero General Public License as published by -the Free Software Foundation, either version 3 of the License, or -(at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU Affero General Public License for more details. - -You should have received a copy of the GNU Affero General Public License -along with this program. If not, see <http://www.gnu.org/licenses/>. -*/ +
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures +
+ Copyright (C) 2011
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team +
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version. +
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details. +
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
import de.lmu.ifi.dbs.elki.algorithm.AbstractDistanceBasedAlgorithm;
import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.database.Database;
import de.lmu.ifi.dbs.elki.database.datastore.DataStore;
-import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery;
+import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction;
import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance;
-import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation;
import de.lmu.ifi.dbs.elki.result.outlier.OutlierResult;
import de.lmu.ifi.dbs.elki.result.outlier.OutlierScoreMeta;
import de.lmu.ifi.dbs.elki.result.outlier.ProbabilisticOutlierScore;
@@ -79,10 +79,8 @@ public abstract class AbstractDBOutlier<O, D extends Distance<D>> extends Abstra *
*/
public OutlierResult run(Database database, Relation<O> relation) throws IllegalStateException {
- DistanceQuery<O, D> distFunc = database.getDistanceQuery(relation, getDistanceFunction());
-
// Run the actual score process
- DataStore<Double> dbodscore = computeOutlierScores(database, distFunc, d);
+ DataStore<Double> dbodscore = computeOutlierScores(database, relation, d);
// Build result representation.
Relation<Double> scoreResult = new MaterializedRelation<Double>("Density-Based Outlier Detection", "db-outlier", TypeUtil.DOUBLE, dbodscore, relation.getDBIDs());
@@ -92,8 +90,13 @@ public abstract class AbstractDBOutlier<O, D extends Distance<D>> extends Abstra /**
* computes an outlier score for each object of the database.
+ *
+ * @param database Database
+ * @param relation Relation
+ * @param d distance
+ * @return computed scores
*/
- protected abstract DataStore<Double> computeOutlierScores(Database database, DistanceQuery<O, D> distFunc, D d);
+ protected abstract DataStore<Double> computeOutlierScores(Database database, Relation<O> relation, D d);
@Override
public TypeInformation[] getInputTypeRestriction() {
@@ -108,8 +111,11 @@ public abstract class AbstractDBOutlier<O, D extends Distance<D>> extends Abstra * @apiviz.exclude
*/
public static abstract class Parameterizer<O, D extends Distance<D>> extends AbstractDistanceBasedAlgorithm.Parameterizer<O, D> {
+ /**
+ * Query radius
+ */
protected D d = null;
-
+
@Override
protected void makeOptions(Parameterization config) {
super.makeOptions(config);
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/AggarwalYuEvolutionary.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/AggarwalYuEvolutionary.java index aba5576e..5d357744 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/AggarwalYuEvolutionary.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/AggarwalYuEvolutionary.java @@ -1,26 +1,27 @@ package de.lmu.ifi.dbs.elki.algorithm.outlier;
-/* -This file is part of ELKI: -Environment for Developing KDD-Applications Supported by Index-Structures - -Copyright (C) 2011 -Ludwig-Maximilians-Universität München -Lehr- und Forschungseinheit für Datenbanksysteme -ELKI Development Team - -This program is free software: you can redistribute it and/or modify -it under the terms of the GNU Affero General Public License as published by -the Free Software Foundation, either version 3 of the License, or -(at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU Affero General Public License for more details. - -You should have received a copy of the GNU Affero General Public License -along with this program. If not, see <http://www.gnu.org/licenses/>. -*/ +
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures +
+ Copyright (C) 2012
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team +
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version. +
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details. +
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
import java.util.ArrayList;
import java.util.Arrays;
@@ -35,13 +36,13 @@ import de.lmu.ifi.dbs.elki.data.type.TypeUtil; import de.lmu.ifi.dbs.elki.database.Database;
import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory;
import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil;
-import de.lmu.ifi.dbs.elki.database.datastore.WritableDataStore;
+import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore;
import de.lmu.ifi.dbs.elki.database.ids.DBID;
import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
+import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.math.DoubleMinMax;
-import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation;
import de.lmu.ifi.dbs.elki.result.outlier.InvertedOutlierScoreMeta;
import de.lmu.ifi.dbs.elki.result.outlier.OutlierResult;
import de.lmu.ifi.dbs.elki.result.outlier.OutlierScoreMeta;
@@ -65,8 +66,7 @@ import de.lmu.ifi.dbs.elki.utilities.pairs.Pair; * detect outliers for high dimensional data.
* <p>
* Reference: <br />
- * Outlier detection for high dimensional data Outlier detection for high
- * dimensional data <br />
+ * Outlier detection for high dimensional data<br />
* C.C. Aggarwal, P. S. Yu <br />
* Proceedings of the 2001 ACM SIGMOD international conference on Management of
* data 2001, Santa Barbara, California, United States
@@ -147,23 +147,23 @@ public class AggarwalYuEvolutionary<V extends NumberVector<?, ?>> extends Abstra Collection<Individuum> individuums = (new EvolutionarySearch(relation, ranges, m, seed)).run();
- WritableDataStore<Double> outlierScore = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_STATIC, Double.class);
+ WritableDoubleDataStore outlierScore = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_STATIC);
for(Individuum ind : individuums) {
DBIDs ids = computeSubspaceForGene(ind.getGene(), ranges);
double sparsityC = sparsity(ids.size(), dbsize, k);
for(DBID id : ids) {
- Double prev = outlierScore.get(id);
- if(prev == null || sparsityC < prev) {
- outlierScore.put(id, sparsityC);
+ double prev = outlierScore.doubleValue(id);
+ if(Double.isNaN(prev) || sparsityC < prev) {
+ outlierScore.putDouble(id, sparsityC);
}
}
}
DoubleMinMax minmax = new DoubleMinMax();
for(DBID id : relation.iterDBIDs()) {
- Double val = outlierScore.get(id);
- if(val == null) {
- outlierScore.put(id, 0.0);
+ double val = outlierScore.doubleValue(id);
+ if(Double.isNaN(val)) {
+ outlierScore.putDouble(id, 0.0);
val = 0.0;
}
minmax.put(val);
@@ -224,9 +224,10 @@ public class AggarwalYuEvolutionary<V extends NumberVector<?, ?>> extends Abstra this.m = m;
this.dbsize = database.size();
this.dim = DatabaseUtil.dimensionality(database);
- if (seed != null) {
+ if(seed != null) {
this.random = new Random(seed);
- } else {
+ }
+ else {
this.random = new Random();
}
}
@@ -274,7 +275,6 @@ public class AggarwalYuEvolutionary<V extends NumberVector<?, ?>> extends Abstra /**
* check the termination criterion
- *
*/
private boolean checkConvergence(Collection<Individuum> pop) {
if(pop.size() == 0) {
@@ -683,15 +683,15 @@ public class AggarwalYuEvolutionary<V extends NumberVector<?, ?>> extends Abstra }
/**
- * Parameterization class. - * - * @author Erich Schubert - * - * @apiviz.exclude + * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
*/
public static class Parameterizer<V extends NumberVector<?, ?>> extends AbstractAggarwalYuOutlier.Parameterizer {
protected int m = 0;
-
+
protected Long seed = null;
@Override
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/AggarwalYuNaive.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/AggarwalYuNaive.java index af80c264..190211c3 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/AggarwalYuNaive.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/AggarwalYuNaive.java @@ -1,26 +1,27 @@ package de.lmu.ifi.dbs.elki.algorithm.outlier;
-/* -This file is part of ELKI: -Environment for Developing KDD-Applications Supported by Index-Structures - -Copyright (C) 2011 -Ludwig-Maximilians-Universität München -Lehr- und Forschungseinheit für Datenbanksysteme -ELKI Development Team - -This program is free software: you can redistribute it and/or modify -it under the terms of the GNU Affero General Public License as published by -the Free Software Foundation, either version 3 of the License, or -(at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU Affero General Public License for more details. - -You should have received a copy of the GNU Affero General Public License -along with this program. If not, see <http://www.gnu.org/licenses/>. -*/ +
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures +
+ Copyright (C) 2012
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team +
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version. +
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details. +
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
import java.util.ArrayList;
import java.util.Vector;
@@ -29,13 +30,13 @@ import de.lmu.ifi.dbs.elki.data.NumberVector; import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory;
import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil;
-import de.lmu.ifi.dbs.elki.database.datastore.WritableDataStore;
+import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore;
import de.lmu.ifi.dbs.elki.database.ids.DBID;
import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
+import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.math.DoubleMinMax;
-import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation;
import de.lmu.ifi.dbs.elki.result.outlier.InvertedOutlierScoreMeta;
import de.lmu.ifi.dbs.elki.result.outlier.OutlierResult;
import de.lmu.ifi.dbs.elki.result.outlier.OutlierScoreMeta;
@@ -55,8 +56,7 @@ import de.lmu.ifi.dbs.elki.utilities.pairs.IntIntPair; *
* <p>
* Reference: <br />
- * Outlier detection for high dimensional data Outlier detection for high
- * dimensional data <br />
+ * Outlier detection for high dimensional data<br />
* C.C. Aggarwal, P. S. Yu<br />
* International Conference on Management of Data Proceedings of the 2001 ACM
* SIGMOD international conference on Management of data 2001, Santa Barbara,
@@ -140,7 +140,7 @@ public class AggarwalYuNaive<V extends NumberVector<?, ?>> extends AbstractAggar }
}
- WritableDataStore<Double> sparsity = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_STATIC, Double.class);
+ WritableDoubleDataStore sparsity = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_STATIC);
// calculate the sparsity coefficient
for(Vector<IntIntPair> sub : Rk) {
DBIDs ids = computeSubspace(sub, ranges);
@@ -148,18 +148,18 @@ public class AggarwalYuNaive<V extends NumberVector<?, ?>> extends AbstractAggar if(sparsityC < 0) {
for(DBID id : ids) {
- Double prev = sparsity.get(id);
- if(prev == null || sparsityC < prev) {
- sparsity.put(id, sparsityC);
+ double prev = sparsity.doubleValue(id);
+ if(Double.isNaN(prev) || sparsityC < prev) {
+ sparsity.putDouble(id, sparsityC);
}
}
}
}
DoubleMinMax minmax = new DoubleMinMax();
for(DBID id : relation.iterDBIDs()) {
- Double val = sparsity.get(id);
- if(val == null) {
- sparsity.put(id, 0.0);
+ double val = sparsity.doubleValue(id);
+ if(Double.isNaN(val)) {
+ sparsity.putDouble(id, 0.0);
val = 0.0;
}
minmax.put(val);
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/DBOutlierDetection.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/DBOutlierDetection.java index 0d5f115b..f4b0ba35 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/DBOutlierDetection.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/DBOutlierDetection.java @@ -1,40 +1,41 @@ package de.lmu.ifi.dbs.elki.algorithm.outlier;
-/* -This file is part of ELKI: -Environment for Developing KDD-Applications Supported by Index-Structures - -Copyright (C) 2011 -Ludwig-Maximilians-Universität München -Lehr- und Forschungseinheit für Datenbanksysteme -ELKI Development Team - -This program is free software: you can redistribute it and/or modify -it under the terms of the GNU Affero General Public License as published by -the Free Software Foundation, either version 3 of the License, or -(at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU Affero General Public License for more details. - -You should have received a copy of the GNU Affero General Public License -along with this program. If not, see <http://www.gnu.org/licenses/>. -*/ +
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures +
+ Copyright (C) 2011
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team +
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version. +
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details. +
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
import java.util.Iterator;
-import java.util.List;
import de.lmu.ifi.dbs.elki.database.Database;
import de.lmu.ifi.dbs.elki.database.datastore.DataStore;
import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory;
import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil;
-import de.lmu.ifi.dbs.elki.database.datastore.WritableDataStore;
+import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore;
import de.lmu.ifi.dbs.elki.database.ids.DBID;
import de.lmu.ifi.dbs.elki.database.query.DatabaseQuery;
-import de.lmu.ifi.dbs.elki.database.query.DistanceResultPair;
import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery;
import de.lmu.ifi.dbs.elki.database.query.knn.KNNQuery;
+import de.lmu.ifi.dbs.elki.database.query.knn.KNNResult;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction;
import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance;
import de.lmu.ifi.dbs.elki.logging.Logging;
@@ -99,11 +100,14 @@ public class DBOutlierDetection<O, D extends Distance<D>> extends AbstractDBOutl }
@Override
- protected DataStore<Double> computeOutlierScores(Database database, DistanceQuery<O, D> distFunc, D neighborhoodSize) {
+ protected DataStore<Double> computeOutlierScores(Database database, Relation<O> relation, D neighborhoodSize) {
+ DistanceQuery<O, D> distFunc = database.getDistanceQuery(relation, getDistanceFunction());
+ KNNQuery<O, D> knnQuery = database.getKNNQuery(distFunc, DatabaseQuery.HINT_OPTIMIZED_ONLY);
+
// maximum number of objects in the D-neighborhood of an outlier
int m = (int) ((distFunc.getRelation().size()) * (1 - p));
- WritableDataStore<Double> scores = DataStoreUtil.makeStorage(distFunc.getRelation().getDBIDs(), DataStoreFactory.HINT_STATIC, Double.class);
+ WritableDoubleDataStore scores = DataStoreUtil.makeDoubleStorage(distFunc.getRelation().getDBIDs(), DataStoreFactory.HINT_STATIC);
if(logger.isVerbose()) {
logger.verbose("computing outlier flag");
}
@@ -112,21 +116,20 @@ public class DBOutlierDetection<O, D extends Distance<D>> extends AbstractDBOutl int counter = 0;
// if index exists, kNN query. if the distance to the mth nearest neighbor
// is more than d -> object is outlier
- KNNQuery<O, D> knnQuery = database.getKNNQuery(distFunc, m, DatabaseQuery.HINT_OPTIMIZED_ONLY);
if(knnQuery != null) {
for(DBID id : distFunc.getRelation().iterDBIDs()) {
counter++;
- final List<DistanceResultPair<D>> knns = knnQuery.getKNNForDBID(id, m);
+ final KNNResult<D> knns = knnQuery.getKNNForDBID(id, m);
if(logger.isDebugging()) {
logger.debugFine("distance to mth nearest neighbour" + knns.toString());
}
if(knns.get(Math.min(m, knns.size()) - 1).getDistance().compareTo(neighborhoodSize) <= 0) {
// flag as outlier
- scores.put(id, 1.0);
+ scores.putDouble(id, 1.0);
}
else {
// flag as no outlier
- scores.put(id, 0.0);
+ scores.putDouble(id, 0.0);
}
}
if(progressOFlags != null) {
@@ -150,11 +153,11 @@ public class DBOutlierDetection<O, D extends Distance<D>> extends AbstractDBOutl if(count < m) {
// flag as outlier
- scores.put(id, 1.0);
+ scores.putDouble(id, 1.0);
}
else {
// flag as no outlier
- scores.put(id, 0.0);
+ scores.putDouble(id, 0.0);
}
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/DBOutlierScore.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/DBOutlierScore.java index c72675f8..ec83a2a2 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/DBOutlierScore.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/DBOutlierScore.java @@ -1,35 +1,37 @@ package de.lmu.ifi.dbs.elki.algorithm.outlier;
-/* -This file is part of ELKI: -Environment for Developing KDD-Applications Supported by Index-Structures - -Copyright (C) 2011 -Ludwig-Maximilians-Universität München -Lehr- und Forschungseinheit für Datenbanksysteme -ELKI Development Team - -This program is free software: you can redistribute it and/or modify -it under the terms of the GNU Affero General Public License as published by -the Free Software Foundation, either version 3 of the License, or -(at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU Affero General Public License for more details. - -You should have received a copy of the GNU Affero General Public License -along with this program. If not, see <http://www.gnu.org/licenses/>. -*/ +
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures +
+ Copyright (C) 2011
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team +
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version. +
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details. +
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
import de.lmu.ifi.dbs.elki.database.Database;
import de.lmu.ifi.dbs.elki.database.datastore.DataStore;
import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory;
import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil;
-import de.lmu.ifi.dbs.elki.database.datastore.WritableDataStore;
+import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore;
import de.lmu.ifi.dbs.elki.database.ids.DBID;
import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery;
import de.lmu.ifi.dbs.elki.database.query.range.RangeQuery;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction;
import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance;
import de.lmu.ifi.dbs.elki.logging.Logging;
@@ -71,17 +73,18 @@ public class DBOutlierScore<O, D extends Distance<D>> extends AbstractDBOutlier< }
@Override
- protected DataStore<Double> computeOutlierScores(Database database, DistanceQuery<O, D> distFunc, D d) {
- WritableDataStore<Double> scores = DataStoreUtil.makeStorage(distFunc.getRelation().getDBIDs(), DataStoreFactory.HINT_STATIC, Double.class);
+ protected DataStore<Double> computeOutlierScores(Database database, Relation<O> relation, D d) {
+ DistanceQuery<O, D> distFunc = database.getDistanceQuery(relation, getDistanceFunction());
RangeQuery<O, D> rangeQuery = database.getRangeQuery(distFunc);
final double size = distFunc.getRelation().size();
+
+ WritableDoubleDataStore scores = DataStoreUtil.makeDoubleStorage(distFunc.getRelation().getDBIDs(), DataStoreFactory.HINT_STATIC);
// TODO: use bulk when implemented.
for(DBID id : distFunc.getRelation().iterDBIDs()) {
// compute percentage of neighbors in the given neighborhood with size d
double n = (rangeQuery.getRangeForDBID(id, d).size()) / size;
- scores.put(id, 1.0 - n);
+ scores.putDouble(id, 1.0 - n);
}
- scores.toString();
return scores;
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/EMOutlier.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/EMOutlier.java index b1464bbb..92d92036 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/EMOutlier.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/EMOutlier.java @@ -3,7 +3,7 @@ package de.lmu.ifi.dbs.elki.algorithm.outlier; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures -Copyright (C) 2011 +Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -32,11 +32,11 @@ import de.lmu.ifi.dbs.elki.data.type.TypeUtil; import de.lmu.ifi.dbs.elki.database.Database;
import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory;
import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil;
-import de.lmu.ifi.dbs.elki.database.datastore.WritableDataStore;
+import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore;
import de.lmu.ifi.dbs.elki.database.ids.DBID;
+import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.logging.Logging;
-import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation;
import de.lmu.ifi.dbs.elki.result.outlier.OutlierResult;
import de.lmu.ifi.dbs.elki.result.outlier.OutlierScoreMeta;
import de.lmu.ifi.dbs.elki.result.outlier.ProbabilisticOutlierScore;
@@ -89,14 +89,14 @@ public class EMOutlier<V extends NumberVector<V, ?>> extends AbstractAlgorithm<O Clustering<EMModel<V>> emresult = emClustering.run(database, relation);
double globmax = 0.0;
- WritableDataStore<Double> emo_score = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, Double.class);
+ WritableDoubleDataStore emo_score = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT);
for(DBID id : relation.iterDBIDs()) {
double maxProb = Double.POSITIVE_INFINITY;
double[] probs = emClustering.getProbClusterIGivenX(id);
for(double prob : probs) {
maxProb = Math.min(1 - prob, maxProb);
}
- emo_score.put(id, maxProb);
+ emo_score.putDouble(id, maxProb);
globmax = Math.max(maxProb, globmax);
}
Relation<Double> scoreres = new MaterializedRelation<Double>("EM outlier scores", "em-outlier", TypeUtil.DOUBLE, emo_score, relation.getDBIDs());
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/GaussianModel.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/GaussianModel.java index 13f047e7..ae47c100 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/GaussianModel.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/GaussianModel.java @@ -3,7 +3,7 @@ package de.lmu.ifi.dbs.elki.algorithm.outlier; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures -Copyright (C) 2011 +Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -28,8 +28,9 @@ import de.lmu.ifi.dbs.elki.data.type.TypeInformation; import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory;
import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil;
-import de.lmu.ifi.dbs.elki.database.datastore.WritableDataStore;
+import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore;
import de.lmu.ifi.dbs.elki.database.ids.DBID;
+import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.math.DoubleMinMax;
@@ -37,7 +38,6 @@ import de.lmu.ifi.dbs.elki.math.MathUtil; import de.lmu.ifi.dbs.elki.math.linearalgebra.CovarianceMatrix;
import de.lmu.ifi.dbs.elki.math.linearalgebra.Matrix;
import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector;
-import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation;
import de.lmu.ifi.dbs.elki.result.outlier.BasicOutlierScoreMeta;
import de.lmu.ifi.dbs.elki.result.outlier.InvertedOutlierScoreMeta;
import de.lmu.ifi.dbs.elki.result.outlier.OutlierResult;
@@ -95,11 +95,11 @@ public class GaussianModel<V extends NumberVector<V, ?>> extends AbstractAlgorit public OutlierResult run(Relation<V> relation) throws IllegalStateException {
DoubleMinMax mm = new DoubleMinMax();
// resulting scores
- WritableDataStore<Double> oscores = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, Double.class);
+ WritableDoubleDataStore oscores = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT);
// Compute mean and covariance Matrix
CovarianceMatrix temp = CovarianceMatrix.make(relation);
- V mean = temp.getMeanVector(relation);
+ Vector mean = temp.getMeanVector(relation).getColumnVector();
// debugFine(mean.toString());
Matrix covarianceMatrix = temp.destroyToNaiveMatrix();
// debugFine(covarianceMatrix.toString());
@@ -110,21 +110,20 @@ public class GaussianModel<V extends NumberVector<V, ?>> extends AbstractAlgorit // for each object compute Mahalanobis distance
for(DBID id : relation.iterDBIDs()) {
- V x = relation.get(id);
- Vector x_minus_mean = x.minus(mean).getColumnVector();
+ Vector x = relation.get(id).getColumnVector().minusEquals(mean);
// Gaussian PDF
- final double mDist = x_minus_mean.transposeTimes(covarianceTransposed).times(x_minus_mean).get(0, 0);
+ final double mDist = x.transposeTimesTimes(covarianceTransposed, x);
final double prob = fakt * Math.exp(-mDist / 2.0);
mm.put(prob);
- oscores.put(id, prob);
+ oscores.putDouble(id, prob);
}
final OutlierScoreMeta meta;
if(invert) {
double max = mm.getMax() != 0 ? mm.getMax() : 1.;
for(DBID id : relation.iterDBIDs()) {
- oscores.put(id, (max - oscores.get(id)) / max);
+ oscores.putDouble(id, (max - oscores.doubleValue(id)) / max);
}
meta = new BasicOutlierScoreMeta(0.0, 1.0);
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/GaussianUniformMixture.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/GaussianUniformMixture.java index 520c3673..aa352582 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/GaussianUniformMixture.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/GaussianUniformMixture.java @@ -3,7 +3,7 @@ package de.lmu.ifi.dbs.elki.algorithm.outlier; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures -Copyright (C) 2011 +Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -30,19 +30,19 @@ import de.lmu.ifi.dbs.elki.data.type.TypeInformation; import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory;
import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil;
-import de.lmu.ifi.dbs.elki.database.datastore.WritableDataStore;
+import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore;
import de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs;
import de.lmu.ifi.dbs.elki.database.ids.DBID;
import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
import de.lmu.ifi.dbs.elki.database.ids.generic.MaskedDBIDs;
+import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.math.DoubleMinMax;
import de.lmu.ifi.dbs.elki.math.MathUtil;
import de.lmu.ifi.dbs.elki.math.linearalgebra.Matrix;
import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector;
-import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation;
import de.lmu.ifi.dbs.elki.result.outlier.BasicOutlierScoreMeta;
import de.lmu.ifi.dbs.elki.result.outlier.OutlierResult;
import de.lmu.ifi.dbs.elki.result.outlier.OutlierScoreMeta;
@@ -137,7 +137,7 @@ public class GaussianUniformMixture<V extends NumberVector<V, ?>> extends Abstra // Positive masked collection
DBIDs anomalousObjs = new MaskedDBIDs(objids, bits, false);
// resulting scores
- WritableDataStore<Double> oscores = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, Double.class);
+ WritableDoubleDataStore oscores = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT);
// compute loglikelihood
double logLike = relation.size() * logml + loglikelihoodNormal(normalObjs, relation);
// logger.debugFine("normalsize " + normalObjs.size() + " anormalsize " +
@@ -159,7 +159,7 @@ public class GaussianUniformMixture<V extends NumberVector<V, ?>> extends Abstra // if the loglike increases more than a threshold, object stays in
// anomalous set and is flagged as outlier
final double loglikeGain = currentLogLike - logLike;
- oscores.put(curid, loglikeGain);
+ oscores.putDouble(curid, loglikeGain);
minmax.put(loglikeGain);
if(loglikeGain > c) {
@@ -206,7 +206,7 @@ public class GaussianUniformMixture<V extends NumberVector<V, ?>> extends Abstra return 0;
}
double prob = 0;
- V mean = DatabaseUtil.centroid(database, objids);
+ Vector mean = DatabaseUtil.centroid(database, objids).getColumnVector();
Matrix covarianceMatrix = DatabaseUtil.covarianceMatrix(database, objids);
// test singulaere matrix
@@ -216,10 +216,8 @@ public class GaussianUniformMixture<V extends NumberVector<V, ?>> extends Abstra double fakt = 1.0 / Math.sqrt(Math.pow(MathUtil.TWOPI, DatabaseUtil.dimensionality(database)) * covarianceDet);
// for each object compute probability and sum
for(DBID id : objids) {
- V x = database.get(id);
-
- Vector x_minus_mean = x.minus(mean).getColumnVector();
- double mDist = x_minus_mean.transposeTimes(covInv).times(x_minus_mean).get(0, 0);
+ Vector x = database.get(id).getColumnVector().minusEquals(mean);
+ double mDist = x.transposeTimesTimes(covInv, x);
prob += Math.log(fakt * Math.exp(-mDist / 2.0));
}
return prob;
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/INFLO.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/INFLO.java index ee4a77ba..083a72a6 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/INFLO.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/INFLO.java @@ -3,7 +3,7 @@ package de.lmu.ifi.dbs.elki.algorithm.outlier; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures -Copyright (C) 2011 +Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -22,8 +22,6 @@ You should have received a copy of the GNU Affero General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. */ -import java.util.List;
-
import de.lmu.ifi.dbs.elki.algorithm.AbstractDistanceBasedAlgorithm;
import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
@@ -31,19 +29,20 @@ import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory;
import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil;
import de.lmu.ifi.dbs.elki.database.datastore.WritableDataStore;
+import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore;
import de.lmu.ifi.dbs.elki.database.ids.DBID;
import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
import de.lmu.ifi.dbs.elki.database.query.DatabaseQuery;
-import de.lmu.ifi.dbs.elki.database.query.DistanceResultPair;
import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery;
import de.lmu.ifi.dbs.elki.database.query.knn.KNNQuery;
+import de.lmu.ifi.dbs.elki.database.query.knn.KNNResult;
+import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction;
import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.math.DoubleMinMax;
-import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation;
import de.lmu.ifi.dbs.elki.result.outlier.OutlierResult;
import de.lmu.ifi.dbs.elki.result.outlier.OutlierScoreMeta;
import de.lmu.ifi.dbs.elki.result.outlier.QuotientOutlierScoreMeta;
@@ -133,9 +132,9 @@ public class INFLO<O, D extends NumberDistance<D, ?>> extends AbstractDistanceBa // RNNS
WritableDataStore<ModifiableDBIDs> rnns = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, ModifiableDBIDs.class);
// density
- WritableDataStore<Double> density = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, Double.class);
+ WritableDoubleDataStore density = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT);
// init knns and rnns
- for(DBID id : distFunc.getRelation().iterDBIDs()) {
+ for(DBID id : relation.iterDBIDs()) {
knns.put(id, DBIDUtil.newArray());
rnns.put(id, DBIDUtil.newArray());
}
@@ -149,13 +148,11 @@ public class INFLO<O, D extends NumberDistance<D, ?>> extends AbstractDistanceBa ModifiableDBIDs s;
if(!processedIDs.contains(id)) {
// TODO: use exactly k neighbors?
- List<DistanceResultPair<D>> list = knnQuery.getKNNForDBID(id, k);
- for(DistanceResultPair<D> d : list) {
- knns.get(id).add(d.getDBID());
- }
+ KNNResult<D> list = knnQuery.getKNNForDBID(id, k);
+ knns.get(id).addDBIDs(list.asDBIDs());
processedIDs.add(id);
s = knns.get(id);
- density.put(id, 1 / list.get(k - 1).getDistance().doubleValue());
+ density.putDouble(id, 1 / list.get(k - 1).getDistance().doubleValue());
}
else {
@@ -164,11 +161,9 @@ public class INFLO<O, D extends NumberDistance<D, ?>> extends AbstractDistanceBa for(DBID q : s) {
if(!processedIDs.contains(q)) {
// TODO: use exactly k neighbors?
- List<DistanceResultPair<D>> listQ = knnQuery.getKNNForDBID(q, k);
- for(DistanceResultPair<D> dq : listQ) {
- knns.get(q).add(dq.getDBID());
- }
- density.put(q, 1 / listQ.get(k - 1).getDistance().doubleValue());
+ KNNResult<D> listQ = knnQuery.getKNNForDBID(q, k);
+ knns.get(q).addDBIDs(listQ.asDBIDs());
+ density.putDouble(q, 1 / listQ.getKNNDistance().doubleValue());
processedIDs.add(q);
}
@@ -186,28 +181,28 @@ public class INFLO<O, D extends NumberDistance<D, ?>> extends AbstractDistanceBa // Calculate INFLO for any Object
// IF Object is pruned INFLO=1.0
DoubleMinMax inflominmax = new DoubleMinMax();
- WritableDataStore<Double> inflos = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC, Double.class);
- for(DBID id : distFunc.getRelation().iterDBIDs()) {
+ WritableDoubleDataStore inflos = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC);
+ for(DBID id : relation.iterDBIDs()) {
if(!pruned.contains(id)) {
ModifiableDBIDs knn = knns.get(id);
ModifiableDBIDs rnn = rnns.get(id);
- double denP = density.get(id);
- knn.addAll(rnn);
+ double denP = density.doubleValue(id);
+ knn.addDBIDs(rnn);
double den = 0;
for(DBID q : knn) {
- double denQ = density.get(q);
+ double denQ = density.doubleValue(q);
den = den + denQ;
}
den = den / rnn.size();
den = den / denP;
- inflos.put(id, den);
+ inflos.putDouble(id, den);
// update minimum and maximum
inflominmax.put(den);
}
if(pruned.contains(id)) {
- inflos.put(id, 1.0);
+ inflos.putDouble(id, 1.0);
inflominmax.put(1.0);
}
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/KNNOutlier.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/KNNOutlier.java index fa89f954..ee748f99 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/KNNOutlier.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/KNNOutlier.java @@ -3,7 +3,7 @@ package de.lmu.ifi.dbs.elki.algorithm.outlier; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures -Copyright (C) 2011 +Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -22,26 +22,24 @@ You should have received a copy of the GNU Affero General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. */ -import java.util.List;
-
import de.lmu.ifi.dbs.elki.algorithm.AbstractDistanceBasedAlgorithm;
import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.database.Database;
import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory;
import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil;
-import de.lmu.ifi.dbs.elki.database.datastore.WritableDataStore;
+import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore;
import de.lmu.ifi.dbs.elki.database.ids.DBID;
-import de.lmu.ifi.dbs.elki.database.query.DistanceResultPair;
import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery;
import de.lmu.ifi.dbs.elki.database.query.knn.KNNQuery;
+import de.lmu.ifi.dbs.elki.database.query.knn.KNNResult;
+import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction;
import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress;
import de.lmu.ifi.dbs.elki.math.DoubleMinMax;
-import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation;
import de.lmu.ifi.dbs.elki.result.outlier.BasicOutlierScoreMeta;
import de.lmu.ifi.dbs.elki.result.outlier.OutlierResult;
import de.lmu.ifi.dbs.elki.result.outlier.OutlierScoreMeta;
@@ -115,15 +113,13 @@ public class KNNOutlier<O, D extends NumberDistance<D, ?>> extends AbstractDista FiniteProgress progressKNNDistance = logger.isVerbose() ? new FiniteProgress("kNN distance for objects", relation.size(), logger) : null;
DoubleMinMax minmax = new DoubleMinMax();
- WritableDataStore<Double> knno_score = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC, Double.class);
+ WritableDoubleDataStore knno_score = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC);
// compute distance to the k nearest neighbor.
- for(DBID id : distanceQuery.getRelation().iterDBIDs()) {
+ for(DBID id : relation.iterDBIDs()) {
// distance to the kth nearest neighbor
- final List<DistanceResultPair<D>> knns = knnQuery.getKNNForDBID(id, k);
- final int last = Math.min(k - 1, knns.size() - 1);
-
- double dkn = knns.get(last).getDistance().doubleValue();
- knno_score.put(id, dkn);
+ final KNNResult<D> knns = knnQuery.getKNNForDBID(id, k);
+ double dkn = knns.getKNNDistance().doubleValue();
+ knno_score.putDouble(id, dkn);
minmax.put(dkn);
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/KNNWeightOutlier.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/KNNWeightOutlier.java index 02e0789b..e9657e12 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/KNNWeightOutlier.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/KNNWeightOutlier.java @@ -1,28 +1,27 @@ package de.lmu.ifi.dbs.elki.algorithm.outlier;
-/* -This file is part of ELKI: -Environment for Developing KDD-Applications Supported by Index-Structures - -Copyright (C) 2011 -Ludwig-Maximilians-Universität München -Lehr- und Forschungseinheit für Datenbanksysteme -ELKI Development Team - -This program is free software: you can redistribute it and/or modify -it under the terms of the GNU Affero General Public License as published by -the Free Software Foundation, either version 3 of the License, or -(at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU Affero General Public License for more details. - -You should have received a copy of the GNU Affero General Public License -along with this program. If not, see <http://www.gnu.org/licenses/>. -*/ -
-import java.util.List;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures +
+ Copyright (C) 2011
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team +
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version. +
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details. +
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
import de.lmu.ifi.dbs.elki.algorithm.AbstractDistanceBasedAlgorithm;
import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
@@ -30,18 +29,19 @@ import de.lmu.ifi.dbs.elki.data.type.TypeUtil; import de.lmu.ifi.dbs.elki.database.Database;
import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory;
import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil;
-import de.lmu.ifi.dbs.elki.database.datastore.WritableDataStore;
+import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore;
import de.lmu.ifi.dbs.elki.database.ids.DBID;
import de.lmu.ifi.dbs.elki.database.query.DistanceResultPair;
import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery;
import de.lmu.ifi.dbs.elki.database.query.knn.KNNQuery;
+import de.lmu.ifi.dbs.elki.database.query.knn.KNNResult;
+import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction;
import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress;
import de.lmu.ifi.dbs.elki.math.DoubleMinMax;
-import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation;
import de.lmu.ifi.dbs.elki.result.outlier.BasicOutlierScoreMeta;
import de.lmu.ifi.dbs.elki.result.outlier.OutlierResult;
import de.lmu.ifi.dbs.elki.result.outlier.OutlierScoreMeta;
@@ -118,20 +118,17 @@ public class KNNWeightOutlier<O, D extends NumberDistance<D, ?>> extends Abstrac // compute distance to the k nearest neighbor. n objects with the highest
// distance are flagged as outliers
- WritableDataStore<Double> knnw_score = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC, Double.class);
- for(DBID id : distanceQuery.getRelation().iterDBIDs()) {
+ WritableDoubleDataStore knnw_score = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC);
+ for(DBID id : relation.iterDBIDs()) {
// compute sum of the distances to the k nearest neighbors
- List<DistanceResultPair<D>> knn = knnQuery.getKNNForDBID(id, k);
- D skn = knn.get(0).getDistance();
- final int last = Math.min(k + 1, knn.size());
- for(int i = 1; i < last; i++) {
- skn = skn.plus(knn.get(i).getDistance());
+ final KNNResult<D> knn = knnQuery.getKNNForDBID(id, k);
+ double skn = 0;
+ for(DistanceResultPair<D> r : knn) {
+ skn += r.getDistance().doubleValue();
}
-
- double doubleSkn = skn.getValue().doubleValue();
- knnw_score.put(id, doubleSkn);
- minmax.put(doubleSkn);
+ knnw_score.putDouble(id, skn);
+ minmax.put(skn);
if(progressKNNWeight != null) {
progressKNNWeight.incrementProcessed(logger);
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/LDOF.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/LDOF.java index 04ffe8cf..d9256428 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/LDOF.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/LDOF.java @@ -1,28 +1,27 @@ package de.lmu.ifi.dbs.elki.algorithm.outlier;
-/* -This file is part of ELKI: -Environment for Developing KDD-Applications Supported by Index-Structures - -Copyright (C) 2011 -Ludwig-Maximilians-Universität München -Lehr- und Forschungseinheit für Datenbanksysteme -ELKI Development Team - -This program is free software: you can redistribute it and/or modify -it under the terms of the GNU Affero General Public License as published by -the Free Software Foundation, either version 3 of the License, or -(at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU Affero General Public License for more details. - -You should have received a copy of the GNU Affero General Public License -along with this program. If not, see <http://www.gnu.org/licenses/>. -*/ -
-import java.util.List;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures +
+ Copyright (C) 2011
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team +
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version. +
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details. +
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
import de.lmu.ifi.dbs.elki.algorithm.AbstractDistanceBasedAlgorithm;
import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
@@ -30,18 +29,19 @@ import de.lmu.ifi.dbs.elki.data.type.TypeUtil; import de.lmu.ifi.dbs.elki.database.Database;
import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory;
import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil;
-import de.lmu.ifi.dbs.elki.database.datastore.WritableDataStore;
+import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore;
import de.lmu.ifi.dbs.elki.database.ids.DBID;
import de.lmu.ifi.dbs.elki.database.query.DistanceResultPair;
import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery;
import de.lmu.ifi.dbs.elki.database.query.knn.KNNQuery;
+import de.lmu.ifi.dbs.elki.database.query.knn.KNNResult;
+import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction;
import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress;
import de.lmu.ifi.dbs.elki.math.DoubleMinMax;
-import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation;
import de.lmu.ifi.dbs.elki.result.outlier.OutlierResult;
import de.lmu.ifi.dbs.elki.result.outlier.OutlierScoreMeta;
import de.lmu.ifi.dbs.elki.result.outlier.QuotientOutlierScoreMeta;
@@ -117,7 +117,7 @@ public class LDOF<O, D extends NumberDistance<D, ?>> extends AbstractDistanceBas // track the maximum value for normalization
DoubleMinMax ldofminmax = new DoubleMinMax();
// compute the ldof values
- WritableDataStore<Double> ldofs = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP, Double.class);
+ WritableDoubleDataStore ldofs = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP);
// compute LOF_SCORE of each db object
if(logger.isVerbose()) {
@@ -125,17 +125,17 @@ public class LDOF<O, D extends NumberDistance<D, ?>> extends AbstractDistanceBas }
FiniteProgress progressLDOFs = logger.isVerbose() ? new FiniteProgress("LDOF_SCORE for objects", relation.size(), logger) : null;
- for(DBID id : distFunc.getRelation().iterDBIDs()) {
- List<DistanceResultPair<D>> neighbors = knnQuery.getKNNForDBID(id, k);
+ for(DBID id : relation.iterDBIDs()) {
+ KNNResult<D> neighbors = knnQuery.getKNNForDBID(id, k);
int nsize = neighbors.size() - 1;
// skip the point itself
double dxp = 0;
double Dxp = 0;
for(DistanceResultPair<D> neighbor1 : neighbors) {
- if(neighbor1.getDBID() != id) {
+ if(!neighbor1.getDBID().equals(id)) {
dxp += neighbor1.getDistance().doubleValue();
for(DistanceResultPair<D> neighbor2 : neighbors) {
- if(neighbor1.getDBID() != neighbor2.getDBID() && neighbor2.getDBID() != id) {
+ if(!neighbor1.getDBID().equals(neighbor2.getDBID()) && !neighbor2.getDBID().equals(id)) {
Dxp += distFunc.distance(neighbor1.getDBID(), neighbor2.getDBID()).doubleValue();
}
}
@@ -147,7 +147,7 @@ public class LDOF<O, D extends NumberDistance<D, ?>> extends AbstractDistanceBas if(ldof.isNaN() || ldof.isInfinite()) {
ldof = 1.0;
}
- ldofs.put(id, ldof);
+ ldofs.putDouble(id, ldof);
// update maximum
ldofminmax.put(ldof);
@@ -176,11 +176,11 @@ public class LDOF<O, D extends NumberDistance<D, ?>> extends AbstractDistanceBas }
/**
- * Parameterization class. - * - * @author Erich Schubert - * - * @apiviz.exclude + * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
*/
public static class Parameterizer<O, D extends NumberDistance<D, ?>> extends AbstractDistanceBasedAlgorithm.Parameterizer<O, D> {
protected int k = 0;
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/LOCI.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/LOCI.java index 57c04be3..cfd8623c 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/LOCI.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/LOCI.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.outlier; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -34,17 +34,19 @@ import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; import de.lmu.ifi.dbs.elki.database.datastore.WritableDataStore; -import de.lmu.ifi.dbs.elki.database.datastore.WritableRecordStore; +import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore; import de.lmu.ifi.dbs.elki.database.ids.DBID; import de.lmu.ifi.dbs.elki.database.query.DistanceResultPair; import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; import de.lmu.ifi.dbs.elki.database.query.range.RangeQuery; +import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction; import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance; import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress; -import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation; +import de.lmu.ifi.dbs.elki.math.DoubleMinMax; +import de.lmu.ifi.dbs.elki.math.MeanVariance; import de.lmu.ifi.dbs.elki.result.outlier.OutlierResult; import de.lmu.ifi.dbs.elki.result.outlier.OutlierScoreMeta; import de.lmu.ifi.dbs.elki.result.outlier.QuotientOutlierScoreMeta; @@ -185,80 +187,66 @@ public class LOCI<O, D extends NumberDistance<D, ?>> 
extends AbstractDistanceBas } // LOCI main step FiniteProgress progressLOCI = logger.isVerbose() ? new FiniteProgress("LOCI scores", relation.size(), logger) : null; - WritableRecordStore store = DataStoreUtil.makeRecordStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC, Double.class, Double.class); - WritableDataStore<Double> mdef_norm = store.getStorage(0, Double.class); - WritableDataStore<Double> mdef_radius = store.getStorage(1, Double.class); + WritableDoubleDataStore mdef_norm = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC); + WritableDoubleDataStore mdef_radius = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC); + DoubleMinMax minmax = new DoubleMinMax(); + for(DBID id : relation.iterDBIDs()) { + final List<DoubleIntPair> cdist = interestingDistances.get(id); + final double maxdist = cdist.get(cdist.size() - 1).first; + final int maxneig = cdist.get(cdist.size() - 1).second; + double maxmdefnorm = 0.0; double maxnormr = 0; - List<DoubleIntPair> cdist = interestingDistances.get(id); - double maxdist = cdist.get(cdist.size() - 1).first; - int maxneig = cdist.get(cdist.size() - 1).second; if(maxneig >= nmin) { D range = distFunc.getDistanceFactory().fromDouble(maxdist); // Compute the largest neighborhood we will need. List<DistanceResultPair<D>> maxneighbors = rangeQuery.getRangeForDBID(id, range); + // Ensure the set is sorted. Should be a no-op with most indexes. + Collections.sort(maxneighbors); + // For any critical distance, compute the normalized MDEF score. 
for(DoubleIntPair c : cdist) { - double alpha_r = alpha * c.first; - // compute n(p_i, \alpha * r) from list - int n_alphar = 0; - for(DoubleIntPair c2 : cdist) { - if(c2.first <= alpha_r) { - n_alphar = c2.second; - } - else { - break; - } - } - // compute \hat{n}(p_i, r, \alpha) - double nhat_r_alpha = 0.0; - double sigma_nhat_r_alpha = 0.0; - // Build the sublist from maxneighbors to match the radius c.first - List<DistanceResultPair<D>> rneighbors = null; - for(int i = nmin; i < maxneighbors.size(); i++) { - DistanceResultPair<D> ne = maxneighbors.get(i); - if(ne.getDistance().doubleValue() > c.first) { - rneighbors = maxneighbors.subList(1, i); - break; - } - } - if(rneighbors == null) { + // Only start when minimum size is fulfilled + if (c.second < nmin) { continue; } - for(DistanceResultPair<D> rn : rneighbors) { - List<DoubleIntPair> rncdist = interestingDistances.get(rn.getDBID()); - int rn_alphar = 0; - for(DoubleIntPair c2 : rncdist) { - if(c2.first <= alpha_r) { - rn_alphar = c2.second; - } - else { - break; - } + final double r = c.first; + final double alpha_r = alpha * r; + // compute n(p_i, \alpha * r) from list (note: alpha_r is different from c!) 
+ final int n_alphar = elementsAtRadius(cdist, alpha_r); + // compute \hat{n}(p_i, r, \alpha) and the corresponding \sigma_{MDEF} + MeanVariance mv_n_r_alpha = new MeanVariance(); + for(DistanceResultPair<D> ne : maxneighbors) { + // Stop at radius r + if(ne.getDistance().doubleValue() > r) { + break; } - nhat_r_alpha = nhat_r_alpha + rn_alphar; - sigma_nhat_r_alpha = sigma_nhat_r_alpha + (rn_alphar * rn_alphar); + int rn_alphar = elementsAtRadius(interestingDistances.get(ne.getDBID()), alpha_r); + mv_n_r_alpha.put(rn_alphar); } - // finalize average and deviation - nhat_r_alpha = nhat_r_alpha / rneighbors.size(); - sigma_nhat_r_alpha = Math.sqrt(sigma_nhat_r_alpha / rneighbors.size() - nhat_r_alpha * nhat_r_alpha); - double mdef = 1.0 - (n_alphar / nhat_r_alpha); - double sigmamdef = sigma_nhat_r_alpha / nhat_r_alpha; - double mdefnorm = mdef / sigmamdef; + // We only use the average and standard deviation + final double nhat_r_alpha = mv_n_r_alpha.getMean(); + final double sigma_nhat_r_alpha = mv_n_r_alpha.getNaiveStddev(); + + // Redundant divisions removed. + final double mdef = (nhat_r_alpha - n_alphar); // / nhat_r_alpha; + final double sigmamdef = sigma_nhat_r_alpha; // / nhat_r_alpha; + final double mdefnorm = mdef / sigmamdef; if(mdefnorm > maxmdefnorm) { maxmdefnorm = mdefnorm; - maxnormr = c.first; + maxnormr = r; } } } else { - // FIXME: when nmin was never fulfilled - what is the proper value then? + // FIXME: when nmin was not fulfilled - what is the proper value then? 
maxmdefnorm = 1.0; maxnormr = maxdist; } - mdef_norm.put(id, maxmdefnorm); - mdef_radius.put(id, maxnormr); + mdef_norm.putDouble(id, maxmdefnorm); + mdef_radius.putDouble(id, maxnormr); + minmax.put(maxmdefnorm); if(progressLOCI != null) { progressLOCI.incrementProcessed(logger); } @@ -267,13 +255,34 @@ public class LOCI<O, D extends NumberDistance<D, ?>> extends AbstractDistanceBas progressLOCI.ensureCompleted(logger); } Relation<Double> scoreResult = new MaterializedRelation<Double>("LOCI normalized MDEF", "loci-mdef-outlier", TypeUtil.DOUBLE, mdef_norm, relation.getDBIDs()); - // TODO: actually provide min and max? - OutlierScoreMeta scoreMeta = new QuotientOutlierScoreMeta(Double.NaN, Double.NaN, 0.0, Double.POSITIVE_INFINITY, 0.0); + OutlierScoreMeta scoreMeta = new QuotientOutlierScoreMeta(minmax.getMin(), minmax.getMax(), Double.POSITIVE_INFINITY, 0.0); OutlierResult result = new OutlierResult(scoreMeta, scoreResult); result.addChildResult(new MaterializedRelation<Double>("LOCI MDEF Radius", "loci-critical-radius", TypeUtil.DOUBLE, mdef_radius, relation.getDBIDs())); return result; } + /** + * Get the number of objects for a given radius, from the list of critical + * distances, storing (radius, count) pairs. 
+ * + * @param criticalDistances + * @param radius + * @return Number of elements at the given radius + */ + protected int elementsAtRadius(List<DoubleIntPair> criticalDistances, final double radius) { + int n_r = 0; + for(DoubleIntPair c2 : criticalDistances) { + if(c2.first > radius) { + break; + } + if(c2.second != Integer.MIN_VALUE) { + // Update + n_r = c2.second; + } + } + return n_r; + } + @Override public TypeInformation[] getInputTypeRestriction() { return TypeUtil.array(getDistanceFunction().getInputTypeRestriction()); diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/LOF.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/LOF.java index 5f5f3568..85e1aef2 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/LOF.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/LOF.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.outlier; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -23,8 +23,6 @@ package de.lmu.ifi.dbs.elki.algorithm.outlier; along with this program. If not, see <http://www.gnu.org/licenses/>. 
*/ -import java.util.List; - import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm; import de.lmu.ifi.dbs.elki.algorithm.AbstractDistanceBasedAlgorithm; import de.lmu.ifi.dbs.elki.data.type.CombinedTypeInformation; @@ -34,13 +32,14 @@ import de.lmu.ifi.dbs.elki.database.QueryUtil; import de.lmu.ifi.dbs.elki.database.datastore.DataStore; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; -import de.lmu.ifi.dbs.elki.database.datastore.WritableDataStore; +import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore; import de.lmu.ifi.dbs.elki.database.ids.DBID; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; import de.lmu.ifi.dbs.elki.database.query.DatabaseQuery; import de.lmu.ifi.dbs.elki.database.query.DistanceResultPair; import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; import de.lmu.ifi.dbs.elki.database.query.knn.KNNQuery; +import de.lmu.ifi.dbs.elki.database.query.knn.KNNResult; import de.lmu.ifi.dbs.elki.database.query.knn.PreprocessorKNNQuery; import de.lmu.ifi.dbs.elki.database.query.rknn.RKNNQuery; import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation; @@ -170,8 +169,21 @@ public class LOF<O, D extends NumberDistance<D, ?>> extends AbstractAlgorithm<Ou } /** + * Constructor. + * + * @param k the value of k + * @param distanceFunction the distance function + * + * Uses the same distance function for neighborhood computation and reachability distance (standard as in the original publication), + * same as {@link #LOF(int, DistanceFunction, DistanceFunction) LOF(int, distanceFunction, distanceFunction)}. + */ + public LOF(int k, DistanceFunction<? super O, D> distanceFunction) { + this(k, distanceFunction, distanceFunction); + } + + /** * Performs the Generalized LOF_SCORE algorithm on the given database by - * calling {@code #doRunInTime(Database)}. + * calling {@link #doRunInTime}. 
* * @param relation Data to process */ @@ -180,7 +192,7 @@ public class LOF<O, D extends NumberDistance<D, ?>> extends AbstractAlgorithm<Ou Pair<KNNQuery<O, D>, KNNQuery<O, D>> pair = getKNNQueries(relation, stepprog); KNNQuery<O, D> kNNRefer = pair.getFirst(); KNNQuery<O, D> kNNReach = pair.getSecond(); - return doRunInTime(kNNRefer, kNNReach, stepprog).getResult(); + return doRunInTime(relation.getDBIDs(), kNNRefer, kNNReach, stepprog).getResult(); } /** @@ -231,7 +243,7 @@ public class LOF<O, D extends NumberDistance<D, ?>> extends AbstractAlgorithm<Ou * function * @param kNNReach the kNN query w.r.t. reachability distance function */ - protected LOFResult<O, D> doRunInTime(KNNQuery<O, D> kNNRefer, KNNQuery<O, D> kNNReach, StepProgress stepprog) throws IllegalStateException { + protected LOFResult<O, D> doRunInTime(DBIDs ids, KNNQuery<O, D> kNNRefer, KNNQuery<O, D> kNNReach, StepProgress stepprog) throws IllegalStateException { // Assert we got something if(kNNRefer == null) { throw new AbortException("No kNN queries supported by database for reference neighborhood distance function."); @@ -244,14 +256,14 @@ public class LOF<O, D extends NumberDistance<D, ?>> extends AbstractAlgorithm<Ou if(stepprog != null) { stepprog.beginStep(2, "Computing LRDs.", logger); } - WritableDataStore<Double> lrds = computeLRDs(kNNReach.getRelation().getDBIDs(), kNNReach); + WritableDoubleDataStore lrds = computeLRDs(ids, kNNReach); // compute LOF_SCORE of each db object if(stepprog != null) { stepprog.beginStep(3, "Computing LOFs.", logger); } - Pair<WritableDataStore<Double>, DoubleMinMax> lofsAndMax = computeLOFs(kNNRefer.getRelation().getDBIDs(), lrds, kNNRefer); - WritableDataStore<Double> lofs = lofsAndMax.getFirst(); + Pair<WritableDoubleDataStore, DoubleMinMax> lofsAndMax = computeLOFs(ids, lrds, kNNRefer); + WritableDoubleDataStore lofs = lofsAndMax.getFirst(); // track the maximum value for normalization. 
DoubleMinMax lofminmax = lofsAndMax.getSecond(); @@ -260,7 +272,7 @@ public class LOF<O, D extends NumberDistance<D, ?>> extends AbstractAlgorithm<Ou } // Build result representation. - Relation<Double> scoreResult = new MaterializedRelation<Double>("Local Outlier Factor", "lof-outlier", TypeUtil.DOUBLE, lofs, kNNRefer.getRelation().getDBIDs()); + Relation<Double> scoreResult = new MaterializedRelation<Double>("Local Outlier Factor", "lof-outlier", TypeUtil.DOUBLE, lofs, ids); OutlierScoreMeta scoreMeta = new QuotientOutlierScoreMeta(lofminmax.getMin(), lofminmax.getMax(), 0.0, Double.POSITIVE_INFINITY, 1.0); OutlierResult result = new OutlierResult(scoreMeta, scoreResult); @@ -275,22 +287,22 @@ public class LOF<O, D extends NumberDistance<D, ?>> extends AbstractAlgorithm<Ou * reachability distance * @return the LRDs of the objects */ - protected WritableDataStore<Double> computeLRDs(DBIDs ids, KNNQuery<O, D> knnReach) { - WritableDataStore<Double> lrds = DataStoreUtil.makeStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP, Double.class); + protected WritableDoubleDataStore computeLRDs(DBIDs ids, KNNQuery<O, D> knnReach) { + WritableDoubleDataStore lrds = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP); FiniteProgress lrdsProgress = logger.isVerbose() ? new FiniteProgress("LRD", ids.size(), logger) : null; for(DBID id : ids) { double sum = 0; - List<DistanceResultPair<D>> neighbors = knnReach.getKNNForDBID(id, k); + KNNResult<D> neighbors = knnReach.getKNNForDBID(id, k); int nsize = neighbors.size() - (objectIsInKNN ? 
0 : 1); for(DistanceResultPair<D> neighbor : neighbors) { if(objectIsInKNN || !neighbor.getDBID().equals(id)) { - List<DistanceResultPair<D>> neighborsNeighbors = knnReach.getKNNForDBID(neighbor.getDBID(), k); - sum += Math.max(neighbor.getDistance().doubleValue(), neighborsNeighbors.get(neighborsNeighbors.size() - 1).getDistance().doubleValue()); + KNNResult<D> neighborsNeighbors = knnReach.getKNNForDBID(neighbor.getDBID(), k); + sum += Math.max(neighbor.getDistance().doubleValue(), neighborsNeighbors.getKNNDistance().doubleValue()); } } // Avoid division by 0 - Double lrd = (sum > 0) ? nsize / sum : 0.0; - lrds.put(id, lrd); + double lrd = (sum > 0) ? nsize / sum : 0.0; + lrds.putDouble(id, lrd); if(lrdsProgress != null) { lrdsProgress.incrementProcessed(logger); } @@ -310,17 +322,17 @@ public class LOF<O, D extends NumberDistance<D, ?>> extends AbstractAlgorithm<Ou * reference distance * @return the LOFs of the objects and the maximum LOF */ - protected Pair<WritableDataStore<Double>, DoubleMinMax> computeLOFs(DBIDs ids, DataStore<Double> lrds, KNNQuery<O, D> knnRefer) { - WritableDataStore<Double> lofs = DataStoreUtil.makeStorage(ids, DataStoreFactory.HINT_STATIC, Double.class); + protected Pair<WritableDoubleDataStore, DoubleMinMax> computeLOFs(DBIDs ids, DataStore<Double> lrds, KNNQuery<O, D> knnRefer) { + WritableDoubleDataStore lofs = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_STATIC); // track the maximum value for normalization. DoubleMinMax lofminmax = new DoubleMinMax(); FiniteProgress progressLOFs = logger.isVerbose() ? new FiniteProgress("LOF_SCORE for objects", ids.size(), logger) : null; for(DBID id : ids) { double lrdp = lrds.get(id); - final Double lof; + final double lof; if(lrdp > 0) { - List<DistanceResultPair<D>> neighbors = knnRefer.getKNNForDBID(id, k); + final KNNResult<D> neighbors = knnRefer.getKNNForDBID(id, k); int nsize = neighbors.size() - (objectIsInKNN ? 
0 : 1); // skip the point itself // neighbors.remove(0); @@ -335,7 +347,7 @@ public class LOF<O, D extends NumberDistance<D, ?>> extends AbstractAlgorithm<Ou else { lof = 1.0; } - lofs.put(id, lof); + lofs.putDouble(id, lof); // update minimum and maximum lofminmax.put(lof); @@ -346,7 +358,7 @@ public class LOF<O, D extends NumberDistance<D, ?>> extends AbstractAlgorithm<Ou if(progressLOFs != null) { progressLOFs.ensureCompleted(logger); } - return new Pair<WritableDataStore<Double>, DoubleMinMax>(lofs, lofminmax); + return new Pair<WritableDoubleDataStore, DoubleMinMax>(lofs, lofminmax); } @Override @@ -399,12 +411,12 @@ public class LOF<O, D extends NumberDistance<D, ?>> extends AbstractAlgorithm<Ou /** * The LRD values of the objects. */ - private final WritableDataStore<Double> lrds; + private final WritableDoubleDataStore lrds; /** * The LOF values of the objects. */ - private final WritableDataStore<Double> lofs; + private final WritableDoubleDataStore lofs; /** * Encapsulates information generated during a run of the {@link LOF} @@ -416,7 +428,7 @@ public class LOF<O, D extends NumberDistance<D, ?>> extends AbstractAlgorithm<Ou * @param lrds the LRD values of the objects * @param lofs the LOF values of the objects */ - public LOFResult(OutlierResult result, KNNQuery<O, D> kNNRefer, KNNQuery<O, D> kNNReach, WritableDataStore<Double> lrds, WritableDataStore<Double> lofs) { + public LOFResult(OutlierResult result, KNNQuery<O, D> kNNRefer, KNNQuery<O, D> kNNReach, WritableDoubleDataStore lrds, WritableDoubleDataStore lofs) { this.result = result; this.kNNRefer = kNNRefer; this.kNNReach = kNNReach; @@ -441,14 +453,14 @@ public class LOF<O, D extends NumberDistance<D, ?>> extends AbstractAlgorithm<Ou /** * @return the LRD values of the objects */ - public WritableDataStore<Double> getLrds() { + public WritableDoubleDataStore getLrds() { return lrds; } /** * @return the LOF values of the objects */ - public WritableDataStore<Double> getLofs() { + public 
WritableDoubleDataStore getLofs() { return lofs; } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/LoOP.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/LoOP.java index dfb32bb9..f1c273f6 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/LoOP.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/LoOP.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.outlier; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -23,8 +23,6 @@ package de.lmu.ifi.dbs.elki.algorithm.outlier; along with this program. If not, see <http://www.gnu.org/licenses/>. */ -import java.util.List; - import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm; import de.lmu.ifi.dbs.elki.data.type.CombinedTypeInformation; import de.lmu.ifi.dbs.elki.data.type.TypeInformation; @@ -33,12 +31,13 @@ import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.QueryUtil; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; -import de.lmu.ifi.dbs.elki.database.datastore.WritableDataStore; +import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore; import de.lmu.ifi.dbs.elki.database.ids.DBID; import de.lmu.ifi.dbs.elki.database.query.DatabaseQuery; import de.lmu.ifi.dbs.elki.database.query.DistanceResultPair; import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; import de.lmu.ifi.dbs.elki.database.query.knn.KNNQuery; +import de.lmu.ifi.dbs.elki.database.query.knn.KNNResult; import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction; @@ -48,8 +47,8 @@ import de.lmu.ifi.dbs.elki.index.preprocessed.knn.MaterializeKNNPreprocessor; import 
de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress; import de.lmu.ifi.dbs.elki.logging.progress.StepProgress; -import de.lmu.ifi.dbs.elki.math.MathUtil; import de.lmu.ifi.dbs.elki.math.MeanVariance; +import de.lmu.ifi.dbs.elki.math.statistics.distribution.NormalDistribution; import de.lmu.ifi.dbs.elki.result.outlier.OutlierResult; import de.lmu.ifi.dbs.elki.result.outlier.OutlierScoreMeta; import de.lmu.ifi.dbs.elki.result.outlier.ProbabilisticOutlierScore; @@ -226,19 +225,19 @@ public class LoOP<O, D extends NumberDistance<D, ?>> extends AbstractAlgorithm<O } // Probabilistic distances - WritableDataStore<Double> pdists = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP, Double.class); + WritableDoubleDataStore pdists = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP); {// computing PRDs if(stepprog != null) { stepprog.beginStep(3, "Computing pdists", logger); } FiniteProgress prdsProgress = logger.isVerbose() ? new FiniteProgress("pdists", relation.size(), logger) : null; for(DBID id : relation.iterDBIDs()) { - List<DistanceResultPair<D>> neighbors = knnReach.getKNNForDBID(id, kreach); + final KNNResult<D> neighbors = knnReach.getKNNForDBID(id, kreach); double sqsum = 0.0; // use first kref neighbors as reference set int ks = 0; for(DistanceResultPair<D> neighbor : neighbors) { - if(objectIsInKNN || neighbor.getDBID() != id) { + if(objectIsInKNN || !neighbor.getDBID().equals(id)) { double d = neighbor.getDistance().doubleValue(); sqsum += d * d; ks++; @@ -247,15 +246,15 @@ public class LoOP<O, D extends NumberDistance<D, ?>> extends AbstractAlgorithm<O } } } - Double pdist = lambda * Math.sqrt(sqsum / ks); - pdists.put(id, pdist); + double pdist = lambda * Math.sqrt(sqsum / ks); + pdists.putDouble(id, pdist); if(prdsProgress != null) { prdsProgress.incrementProcessed(logger); } } } // Compute PLOF values. 
- WritableDataStore<Double> plofs = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP, Double.class); + WritableDoubleDataStore plofs = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP); MeanVariance mvplof = new MeanVariance(); {// compute LOOP_SCORE of each db object if(stepprog != null) { @@ -264,24 +263,24 @@ public class LoOP<O, D extends NumberDistance<D, ?>> extends AbstractAlgorithm<O FiniteProgress progressPLOFs = logger.isVerbose() ? new FiniteProgress("PLOFs for objects", relation.size(), logger) : null; for(DBID id : relation.iterDBIDs()) { - List<DistanceResultPair<D>> neighbors = knnComp.getKNNForDBID(id, kcomp); + final KNNResult<D> neighbors = knnComp.getKNNForDBID(id, kcomp); MeanVariance mv = new MeanVariance(); // use first kref neighbors as comparison set. int ks = 0; for(DistanceResultPair<D> neighbor1 : neighbors) { - if(objectIsInKNN || neighbor1.getDBID() != id) { - mv.put(pdists.get(neighbor1.getDBID())); + if(objectIsInKNN || !neighbor1.getDBID().equals(id)) { + mv.put(pdists.doubleValue(neighbor1.getDBID())); ks++; if(ks >= kcomp) { break; } } } - double plof = Math.max(pdists.get(id) / mv.getMean(), 1.0); + double plof = Math.max(pdists.doubleValue(id) / mv.getMean(), 1.0); if(Double.isNaN(plof) || Double.isInfinite(plof)) { plof = 1.0; } - plofs.put(id, plof); + plofs.putDouble(id, plof); mvplof.put((plof - 1.0) * (plof - 1.0)); if(progressPLOFs != null) { @@ -296,7 +295,7 @@ public class LoOP<O, D extends NumberDistance<D, ?>> extends AbstractAlgorithm<O } // Compute final LoOP values. 
- WritableDataStore<Double> loops = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC, Double.class); + WritableDoubleDataStore loops = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC); {// compute LOOP_SCORE of each db object if(stepprog != null) { stepprog.beginStep(5, "Computing LoOP scores", logger); @@ -304,7 +303,7 @@ public class LoOP<O, D extends NumberDistance<D, ?>> extends AbstractAlgorithm<O FiniteProgress progressLOOPs = logger.isVerbose() ? new FiniteProgress("LoOP for objects", relation.size(), logger) : null; for(DBID id : relation.iterDBIDs()) { - loops.put(id, MathUtil.erf((plofs.get(id) - 1) / (nplof * sqrt2))); + loops.putDouble(id, NormalDistribution.erf((plofs.doubleValue(id) - 1) / (nplof * sqrt2))); if(progressLOOPs != null) { progressLOOPs.incrementProcessed(logger); diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/OPTICSOF.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/OPTICSOF.java index 369db4d3..2f120c44 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/OPTICSOF.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/OPTICSOF.java @@ -3,7 +3,7 @@ package de.lmu.ifi.dbs.elki.algorithm.outlier; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures -Copyright (C) 2011 +Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -33,11 +33,13 @@ import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory;
import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil;
import de.lmu.ifi.dbs.elki.database.datastore.WritableDataStore;
+import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore;
import de.lmu.ifi.dbs.elki.database.ids.DBID;
import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
import de.lmu.ifi.dbs.elki.database.query.DistanceResultPair;
import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery;
import de.lmu.ifi.dbs.elki.database.query.knn.KNNQuery;
+import de.lmu.ifi.dbs.elki.database.query.knn.KNNResult;
import de.lmu.ifi.dbs.elki.database.query.range.RangeQuery;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction;
@@ -111,30 +113,31 @@ public class OPTICSOF<O, D extends NumberDistance<D, ?>> extends AbstractDistanc RangeQuery<O, D> rangeQuery = database.getRangeQuery(distQuery);
DBIDs ids = relation.getDBIDs();
- WritableDataStore<List<DistanceResultPair<D>>> nMinPts = DataStoreUtil.makeStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP, List.class);
- WritableDataStore<Double> coreDistance = DataStoreUtil.makeStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP, Double.class);
+ // FIXME: implicit preprocessor.
+ WritableDataStore<KNNResult<D>> nMinPts = DataStoreUtil.makeStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP, KNNResult.class);
+ WritableDoubleDataStore coreDistance = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP);
WritableDataStore<Integer> minPtsNeighborhoodSize = DataStoreUtil.makeStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP, Integer.class);
// Pass 1
// N_minpts(id) and core-distance(id)
for(DBID id : relation.iterDBIDs()) {
- List<DistanceResultPair<D>> minptsNeighbours = knnQuery.getKNNForDBID(id, minpts);
- D d = minptsNeighbours.get(minptsNeighbours.size() - 1).getDistance();
+ KNNResult<D> minptsNeighbours = knnQuery.getKNNForDBID(id, minpts);
+ D d = minptsNeighbours.getKNNDistance();
nMinPts.put(id, minptsNeighbours);
- coreDistance.put(id, d.doubleValue());
+ coreDistance.putDouble(id, d.doubleValue());
minPtsNeighborhoodSize.put(id, rangeQuery.getRangeForDBID(id, d).size());
}
// Pass 2
WritableDataStore<List<Double>> reachDistance = DataStoreUtil.makeStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP, List.class);
- WritableDataStore<Double> lrds = DataStoreUtil.makeStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP, Double.class);
+ WritableDoubleDataStore lrds = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP);
for(DBID id : relation.iterDBIDs()) {
List<Double> core = new ArrayList<Double>();
double lrd = 0;
for(DistanceResultPair<D> neighPair : nMinPts.get(id)) {
DBID idN = neighPair.getDBID();
- double coreDist = coreDistance.get(idN);
+ double coreDist = coreDistance.doubleValue(idN);
double dist = distQuery.distance(id, idN).doubleValue();
Double rd = Math.max(coreDist, dist);
lrd = rd + lrd;
@@ -142,22 +145,22 @@ public class OPTICSOF<O, D extends NumberDistance<D, ?>> extends AbstractDistanc }
lrd = (minPtsNeighborhoodSize.get(id) / lrd);
reachDistance.put(id, core);
- lrds.put(id, lrd);
+ lrds.putDouble(id, lrd);
}
// Pass 3
DoubleMinMax ofminmax = new DoubleMinMax();
- WritableDataStore<Double> ofs = DataStoreUtil.makeStorage(ids, DataStoreFactory.HINT_STATIC, Double.class);
+ WritableDoubleDataStore ofs = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_STATIC);
for(DBID id : relation.iterDBIDs()) {
double of = 0;
for(DistanceResultPair<D> pair : nMinPts.get(id)) {
DBID idN = pair.getDBID();
- double lrd = lrds.get(id);
- double lrdN = lrds.get(idN);
+ double lrd = lrds.doubleValue(id);
+ double lrdN = lrds.doubleValue(idN);
of = of + lrdN / lrd;
}
of = of / minPtsNeighborhoodSize.get(id);
- ofs.put(id, of);
+ ofs.putDouble(id, of);
// update minimum and maximum
ofminmax.put(of);
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/OUTRES.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/OUTRES.java new file mode 100644 index 00000000..912f878a --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/OUTRES.java @@ -0,0 +1,368 @@ +package de.lmu.ifi.dbs.elki.algorithm.outlier; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2012 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +import java.util.Arrays; +import java.util.BitSet; +import java.util.List; + +import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm; +import de.lmu.ifi.dbs.elki.algorithm.outlier.OutlierAlgorithm; +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.data.type.TypeInformation; +import de.lmu.ifi.dbs.elki.data.type.TypeUtil; +import de.lmu.ifi.dbs.elki.database.QueryUtil; +import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; +import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; +import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore; +import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.query.DistanceResultPair; +import de.lmu.ifi.dbs.elki.database.query.DoubleDistanceResultPair; +import de.lmu.ifi.dbs.elki.database.query.range.RangeQuery; +import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.distance.distancefunction.subspace.SubspaceEuclideanDistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancevalue.DoubleDistance; +import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress; +import de.lmu.ifi.dbs.elki.math.DoubleMinMax; +import de.lmu.ifi.dbs.elki.math.MeanVariance; +import de.lmu.ifi.dbs.elki.math.statistics.EpanechnikovKernelDensityFunction; +import de.lmu.ifi.dbs.elki.math.statistics.KernelDensityFunction; +import de.lmu.ifi.dbs.elki.math.statistics.distribution.GammaDistribution; +import de.lmu.ifi.dbs.elki.result.outlier.InvertedOutlierScoreMeta; +import de.lmu.ifi.dbs.elki.result.outlier.OutlierResult; +import de.lmu.ifi.dbs.elki.result.outlier.OutlierScoreMeta; +import de.lmu.ifi.dbs.elki.utilities.DatabaseUtil; +import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; 
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter;
+
+/**
+ * Adaptive outlierness for subspace outlier ranking (OUTRES).
+ *
+ * Note: this algorithm seems to have an O(n^3) complexity without appropriate
+ * index structures to accelerate range queries: each object in each tested
+ * subspace will need to know the mean and standard deviation of the density of
+ * the neighbors, which in turn needs another range query.
+ *
+ * Reference:
+ * <p>
+ * E. Müller, M. Schiffer, T. Seidl<br />
+ * Adaptive outlierness for subspace outlier ranking<br />
+ * in: Proc. 19th ACM International Conference on Information and knowledge
+ * management
+ * </p>
+ *
+ * @author Pleintinger Viktoria
+ * @author Erich Schubert
+ */
+@Reference(authors = "E. Müller, M. Schiffer, T. Seidl", title = "Adaptive outlierness for subspace outlier ranking", booktitle = "Proc. 19th ACM International Conference on Information and knowledge management")
+public class OUTRES<V extends NumberVector<V, ?>> extends AbstractAlgorithm<OutlierResult> implements OutlierAlgorithm {
+  /**
+   * The logger for this class.
+   */
+  private static final Logging logger = Logging.getLogger(OUTRES.class);
+
+  /**
+   * The epsilon (in 2d) parameter
+   */
+  private final double eps;
+
+  /**
+   * Constant for Kolmogorov-Smirnov at alpha=0.01 (table value)
+   */
+  private static final double K_S_CRITICAL001 = 1.63;
+
+  /**
+   * Constructor.
+   *
+   * @param eps Epsilon
+   */
+  public OUTRES(double eps) {
+    super();
+    this.eps = eps;
+  }
+
+  /**
+   * Score every object of the relation, starting from the empty subspace.
+   *
+   * @param relation Relation to process
+   * @return Outlier detection result
+   */
+  public OutlierResult run(Relation<V> relation) {
+    WritableDoubleDataStore ranks = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC);
+    DoubleMinMax minmax = new DoubleMinMax();
+
+    KernelDensityEstimator kernel = new KernelDensityEstimator(relation);
+    BitSet subspace = new BitSet(kernel.dim);
+
+    FiniteProgress progress = logger.isVerbose() ? new FiniteProgress("OUTRES scores", relation.size(), logger) : null;
+
+    for(DBID object : relation.iterDBIDs()) {
+      subspace.clear();
+      double score = outresScore(0, subspace, object, kernel);
+      ranks.putDouble(object, score);
+      minmax.put(score);
+      if(progress != null) {
+        progress.incrementProcessed(logger);
+      }
+    }
+    if(progress != null) {
+      progress.ensureCompleted(logger);
+    }
+
+    OutlierScoreMeta meta = new InvertedOutlierScoreMeta(minmax.getMin(), minmax.getMax(), 0., 1., 1.);
+    return new OutlierResult(meta, new MaterializedRelation<Double>("OUTRES", "outres-score", TypeUtil.DOUBLE, ranks, relation.getDBIDs()));
+  }
+
+  /**
+   * Recursive subspace search of OUTRES, run for each object.
+   *
+   * @param s start dimension
+   * @param subspace Current subspace, modified during the search and restored
+   * @param id Current object ID
+   * @param kernel Kernel
+   * @return Score
+   */
+  public double outresScore(final int s, BitSet subspace, DBID id, KernelDensityEstimator kernel) {
+    double score = 1.0; // Initial score is 1.0
+
+    for(int i = s; i < kernel.dim; i++) {
+      if(subspace.get(i)) { // TODO: needed? Or should we always start with i=0?
+        continue;
+      }
+      subspace.set(i);
+      final SubspaceEuclideanDistanceFunction df = new SubspaceEuclideanDistanceFunction(subspace);
+      final DoubleDistance range = new DoubleDistance(kernel.adjustedEps(kernel.dim));
+      RangeQuery<V, DoubleDistance> rq = QueryUtil.getRangeQuery(kernel.relation, df, range);
+
+      List<DistanceResultPair<DoubleDistance>> neigh = rq.getRangeForDBID(id, range);
+      if(neigh.size() > 2) {
+        // Relevance test
+        if(relevantSubspace(subspace, neigh, kernel)) {
+          final double density = kernel.subspaceDensity(subspace, neigh);
+          // Compute mean and standard deviation for densities of neighbors.
+          MeanVariance meanv = new MeanVariance();
+          for(DistanceResultPair<DoubleDistance> pair : neigh) {
+            List<DistanceResultPair<DoubleDistance>> n2 = rq.getRangeForDBID(pair.getDBID(), range);
+            meanv.put(kernel.subspaceDensity(subspace, n2));
+          }
+          final double deviation = (meanv.getMean() - density) / (2. * meanv.getSampleStddev());
+          // High deviation:
+          if(deviation >= 1) {
+            score *= (density / deviation);
+          }
+          // Recursion
+          score *= outresScore(i + 1, subspace, id, kernel);
+        }
+      }
+      subspace.clear(i);
+    }
+    return score;
+  }
+
+  /**
+   * Subspace relevance test: Kolmogorov-Smirnov test of the neighborhood
+   * against a uniform distribution, in each dimension of the subspace.
+   *
+   * @param subspace Subspace to test
+   * @param neigh Neighbors of the current object in this subspace
+   * @param kernel Kernel density estimator (supplies the relation)
+   * @return false if a dimension deviates beyond the critical value, else true
+   */
+  protected boolean relevantSubspace(BitSet subspace, List<DistanceResultPair<DoubleDistance>> neigh, KernelDensityEstimator kernel) {
+    Relation<V> relation = kernel.relation;
+    final double crit = K_S_CRITICAL001 / Math.sqrt(neigh.size());
+
+    for(int dim = subspace.nextSetBit(0); dim >= 0; dim = subspace.nextSetBit(dim + 1)) {
+      // TODO: can we save this copy somehow?
+      double[] data = new double[neigh.size()];
+      int count = 0;
+      for(DistanceResultPair<DoubleDistance> object : neigh) {
+        data[count++] = relation.get(object.getDBID()).doubleValue(dim + 1);
+      }
+      assert (count == neigh.size());
+      Arrays.sort(data);
+
+      final double norm = data[data.length - 1] - data[0];
+      final double min = data[0];
+
+      // Kolmogorov-Smirnov test against uniform distribution; minimum and
+      // maximum map to 0 and 1 exactly, so only interior samples are compared.
+      for(int j = 1; j < data.length - 1; j++) {
+        double delta = (j / (data.length - 1.)) - ((data[j] - min) / norm);
+        if(Math.abs(delta) > crit) {
+          return false;
+        }
+      }
+    }
+    return true;
+  }
+
+  /**
+   * Kernel density estimation and utility class.
+   *
+   * @author Erich Schubert
+   */
+  protected class KernelDensityEstimator {
+    /**
+     * Actual kernel in use
+     */
+    final KernelDensityFunction kernel = EpanechnikovKernelDensityFunction.KERNEL;
+
+    /**
+     * Relation to retrieve data from
+     */
+    final Relation<V> relation;
+
+    /**
+     * Epsilon values for different subspace dimensionalities
+     */
+    final double[] epsilons;
+
+    /**
+     * Optimal bandwidth for a dimensionality of 2
+     */
+    final double hopttwo;
+
+    /**
+     * Dimensionality of data set
+     */
+    final int dim;
+
+    /**
+     * Constructor.
+     *
+     * @param relation Relation to apply to
+     */
+    public KernelDensityEstimator(Relation<V> relation) {
+      super();
+      this.relation = relation;
+      dim = DatabaseUtil.dimensionality(relation);
+      hopttwo = optimalBandwidth(2);
+      epsilons = new double[Math.max(dim, 2) + 1]; // slot 2 must always exist
+      Arrays.fill(epsilons, Double.NEGATIVE_INFINITY);
+      epsilons[2] = OUTRES.this.eps;
+    }
+
+    /**
+     * Compute density in the given subspace.
+     *
+     * @param subspace Subspace
+     * @param neighbours Neighbor distance list
+     * @return Density
+     */
+    protected double subspaceDensity(BitSet subspace, List<DistanceResultPair<DoubleDistance>> neighbours) {
+      final double bandwidth = optimalBandwidth(subspace.cardinality());
+
+      // TODO: optimize by moving instanceof outside?
+      double density = 0;
+      for(DistanceResultPair<DoubleDistance> pair : neighbours) {
+        if(pair instanceof DoubleDistanceResultPair) {
+          density += kernel.density(((DoubleDistanceResultPair) pair).getDoubleDistance() / bandwidth);
+        }
+        else {
+          density += kernel.density(pair.getDistance().doubleValue() / bandwidth);
+        }
+      }
+
+      return density / relation.size();
+    }
+
+    /**
+     * Compute optimal kernel bandwidth
+     *
+     * @param dim Dimensionality of subspace
+     * @return optimal bandwidth
+     */
+    protected double optimalBandwidth(int dim) {
+      // Pi in the publication is redundant and cancels out!
+      double hopt = 8 * Math.exp(GammaDistribution.logGamma(dim / 2.0 + 1)) * (dim + 4) * Math.pow(2, dim);
+      // Floating point exponent: the integer division -1 / (dim + 4) is 0.
+      return hopt * Math.pow(relation.size(), (-1. / (dim + 4)));
+    }
+
+    /**
+     * Rescale the query radius based on the given dimensionality.
+     *
+     * @param dim Dimensionality
+     * @return Query radius
+     */
+    protected double adjustedEps(int dim) {
+      // Cached
+      double e = epsilons[dim];
+      if(e < 0) {
+        e = epsilons[2] * optimalBandwidth(dim) / hopttwo;
+        epsilons[dim] = e;
+      }
+      return e;
+    }
+  }
+
+  @Override
+  protected Logging getLogger() {
+    return logger;
+  }
+
+  @Override
+  public TypeInformation[] getInputTypeRestriction() {
+    return TypeUtil.array(TypeUtil.NUMBER_VECTOR_FIELD);
+  }
+
+  /**
+   * Parameterization class.
+   *
+   * @author Viktoria Pleintinger
+   *
+   * @apiviz.exclude
+   */
+  public static class Parameterizer<O extends NumberVector<O, ?>> extends AbstractParameterizer {
+    /**
+     * Option ID for Epsilon parameter
+     */
+    public static final OptionID D_ID = OptionID.getOrCreateOptionID("outres.epsilon", "Range value for OUTRES in 2 dimensions.");
+
+    /**
+     * Query radius
+     */
+    protected double eps;
+
+    @Override
+    protected void makeOptions(Parameterization config) {
+      super.makeOptions(config);
+      final DoubleParameter param = new DoubleParameter(D_ID);
+      if(config.grab(param)) {
+        eps = param.getValue();
+      }
+    }
+
+    @Override
+    protected OUTRES<O> makeInstance() {
+      return new OUTRES<O>(eps);
+    }
+  }
+}
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/OnlineLOF.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/OnlineLOF.java index a5115fdf..ad17398c 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/OnlineLOF.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/OnlineLOF.java @@ -3,7 +3,7 @@ package de.lmu.ifi.dbs.elki.algorithm.outlier; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures -Copyright (C) 2011 +Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -26,7 +26,7 @@ import java.util.List; import de.lmu.ifi.dbs.elki.algorithm.AbstractDistanceBasedAlgorithm;
import de.lmu.ifi.dbs.elki.database.QueryUtil;
-import de.lmu.ifi.dbs.elki.database.datastore.WritableDataStore;
+import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore;
import de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs;
import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs;
import de.lmu.ifi.dbs.elki.database.ids.DBID;
@@ -37,6 +37,7 @@ import de.lmu.ifi.dbs.elki.database.query.DatabaseQuery; import de.lmu.ifi.dbs.elki.database.query.DistanceResultPair;
import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery;
import de.lmu.ifi.dbs.elki.database.query.knn.KNNQuery;
+import de.lmu.ifi.dbs.elki.database.query.knn.KNNResult;
import de.lmu.ifi.dbs.elki.database.query.knn.PreprocessorKNNQuery;
import de.lmu.ifi.dbs.elki.database.query.rknn.RKNNQuery;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
@@ -101,14 +102,14 @@ public class OnlineLOF<O, D extends NumberDistance<D, ?>> extends LOF<O, D> { RKNNQuery<O, D> rkNNRefer = queries.getSecond().getFirst();
RKNNQuery<O, D> rkNNReach = queries.getSecond().getSecond();
- LOFResult<O, D> lofResult = super.doRunInTime(kNNRefer, kNNReach, stepprog);
+ LOFResult<O, D> lofResult = super.doRunInTime(relation.getDBIDs(), kNNRefer, kNNReach, stepprog);
lofResult.setRkNNRefer(rkNNRefer);
lofResult.setRkNNReach(rkNNReach);
// add listener
KNNListener l = new LOFKNNListener(lofResult);
- ((MaterializeKNNPreprocessor<O, D>)((PreprocessorKNNQuery<O, D>) lofResult.getKNNRefer()).getPreprocessor()).addKNNListener(l);
- ((MaterializeKNNPreprocessor<O, D>)((PreprocessorKNNQuery<O, D>) lofResult.getKNNReach()).getPreprocessor()).addKNNListener(l);
+ ((MaterializeKNNPreprocessor<O, D>)((PreprocessorKNNQuery<O, D, ? extends KNNResult<D>>) lofResult.getKNNRefer()).getPreprocessor()).addKNNListener(l);
+ ((MaterializeKNNPreprocessor<O, D>)((PreprocessorKNNQuery<O, D, ? extends KNNResult<D>>) lofResult.getKNNReach()).getPreprocessor()).addKNNListener(l);
return lofResult.getResult();
}
@@ -194,8 +195,8 @@ public class OnlineLOF<O, D extends NumberDistance<D, ?>> extends LOF<O, D> { @Override
public void kNNsChanged(KNNChangeEvent e) {
- AbstractMaterializeKNNPreprocessor<O, D> p1 = ((PreprocessorKNNQuery<O, D>) lofResult.getKNNRefer()).getPreprocessor();
- AbstractMaterializeKNNPreprocessor<O, D> p2 = ((PreprocessorKNNQuery<O, D>) lofResult.getKNNReach()).getPreprocessor();
+ AbstractMaterializeKNNPreprocessor<O, D, ?> p1 = ((PreprocessorKNNQuery<O, D, ?>) lofResult.getKNNRefer()).getPreprocessor();
+ AbstractMaterializeKNNPreprocessor<O, D, ?> p2 = ((PreprocessorKNNQuery<O, D, ?>) lofResult.getKNNReach()).getPreprocessor();
if(firstEventReceived == null) {
if(e.getSource().equals(p1) && e.getSource().equals(p2)) {
@@ -266,13 +267,13 @@ public class OnlineLOF<O, D extends NumberDistance<D, ?>> extends LOF<O, D> { ArrayDBIDs lrd_ids = DBIDUtil.ensureArray(DBIDUtil.union(insertions, updates2));
List<List<DistanceResultPair<D>>> reachDistRKNNs = lofResult.getRkNNReach().getRKNNForBulkDBIDs(lrd_ids, k);
ArrayDBIDs affected_lrd_id_candidates = mergeIDs(reachDistRKNNs, lrd_ids);
- ArrayDBIDs affected_lrd_ids = DBIDUtil.newArray(affected_lrd_id_candidates.size());
- WritableDataStore<Double> new_lrds = computeLRDs(affected_lrd_id_candidates, lofResult.getKNNReach());
+ ArrayModifiableDBIDs affected_lrd_ids = DBIDUtil.newArray(affected_lrd_id_candidates.size());
+ WritableDoubleDataStore new_lrds = computeLRDs(affected_lrd_id_candidates, lofResult.getKNNReach());
for(DBID id : affected_lrd_id_candidates) {
- Double new_lrd = new_lrds.get(id);
- Double old_lrd = lofResult.getLrds().get(id);
- if(old_lrd == null || !old_lrd.equals(new_lrd)) {
- lofResult.getLrds().put(id, new_lrd);
+ double new_lrd = new_lrds.doubleValue(id);
+ double old_lrd = lofResult.getLrds().doubleValue(id);
+ if(Double.isNaN(old_lrd) || old_lrd != new_lrd) {
+ lofResult.getLrds().putDouble(id, new_lrd);
affected_lrd_ids.add(id);
}
}
@@ -325,13 +326,13 @@ public class OnlineLOF<O, D extends NumberDistance<D, ?>> extends LOF<O, D> { ArrayDBIDs lrd_ids = DBIDUtil.ensureArray(updates2);
List<List<DistanceResultPair<D>>> reachDistRKNNs = lofResult.getRkNNReach().getRKNNForBulkDBIDs(lrd_ids, k);
ArrayDBIDs affected_lrd_id_candidates = mergeIDs(reachDistRKNNs, lrd_ids);
- ArrayDBIDs affected_lrd_ids = DBIDUtil.newArray(affected_lrd_id_candidates.size());
- WritableDataStore<Double> new_lrds = computeLRDs(affected_lrd_id_candidates, lofResult.getKNNReach());
+ ArrayModifiableDBIDs affected_lrd_ids = DBIDUtil.newArray(affected_lrd_id_candidates.size());
+ WritableDoubleDataStore new_lrds = computeLRDs(affected_lrd_id_candidates, lofResult.getKNNReach());
for(DBID id : affected_lrd_id_candidates) {
- Double new_lrd = new_lrds.get(id);
- Double old_lrd = lofResult.getLrds().get(id);
- if(!old_lrd.equals(new_lrd)) {
- lofResult.getLrds().put(id, new_lrd);
+ double new_lrd = new_lrds.doubleValue(id);
+ double old_lrd = lofResult.getLrds().doubleValue(id);
+ if(old_lrd != new_lrd) {
+ lofResult.getLrds().putDouble(id, new_lrd);
affected_lrd_ids.add(id);
}
}
@@ -364,7 +365,7 @@ public class OnlineLOF<O, D extends NumberDistance<D, ?>> extends LOF<O, D> { * ids
*/
private ArrayModifiableDBIDs mergeIDs(List<List<DistanceResultPair<D>>> queryResults, DBIDs... ids) {
- ModifiableDBIDs result = DBIDUtil.newTreeSet();
+ ModifiableDBIDs result = DBIDUtil.newHashSet();
for(DBIDs dbids : ids) {
result.addDBIDs(dbids);
}
@@ -383,10 +384,10 @@ public class OnlineLOF<O, D extends NumberDistance<D, ?>> extends LOF<O, D> { * @param lofResult the result of the former LOF run
*/
private void recomputeLOFs(DBIDs ids, LOFResult<O, D> lofResult) {
- Pair<WritableDataStore<Double>, DoubleMinMax> lofsAndMax = computeLOFs(ids, lofResult.getLrds(), lofResult.getKNNRefer());
- WritableDataStore<Double> new_lofs = lofsAndMax.getFirst();
+ Pair<WritableDoubleDataStore, DoubleMinMax> lofsAndMax = computeLOFs(ids, lofResult.getLrds(), lofResult.getKNNRefer());
+ WritableDoubleDataStore new_lofs = lofsAndMax.getFirst();
for(DBID id : ids) {
- lofResult.getLofs().put(id, new_lofs.get(id));
+ lofResult.getLofs().putDouble(id, new_lofs.doubleValue(id));
}
// track the maximum value for normalization.
DoubleMinMax new_lofminmax = lofsAndMax.getSecond();
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/OutlierAlgorithm.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/OutlierAlgorithm.java index fb0a89d5..2b122183 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/OutlierAlgorithm.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/OutlierAlgorithm.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.outlier; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/ReferenceBasedOutlierDetection.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/ReferenceBasedOutlierDetection.java index e7895bad..befd03ed 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/ReferenceBasedOutlierDetection.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/ReferenceBasedOutlierDetection.java @@ -1,26 +1,27 @@ package de.lmu.ifi.dbs.elki.algorithm.outlier;
-/* -This file is part of ELKI: -Environment for Developing KDD-Applications Supported by Index-Structures - -Copyright (C) 2011 -Ludwig-Maximilians-Universität München -Lehr- und Forschungseinheit für Datenbanksysteme -ELKI Development Team - -This program is free software: you can redistribute it and/or modify -it under the terms of the GNU Affero General Public License as published by -the Free Software Foundation, either version 3 of the License, or -(at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU Affero General Public License for more details. - -You should have received a copy of the GNU Affero General Public License -along with this program. If not, see <http://www.gnu.org/licenses/>. -*/ +
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures +
+ Copyright (C) 2011
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team +
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version. +
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details. +
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
import java.util.ArrayList;
import java.util.Collection;
@@ -35,17 +36,18 @@ import de.lmu.ifi.dbs.elki.data.type.TypeInformation; import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory;
import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil;
-import de.lmu.ifi.dbs.elki.database.datastore.WritableDataStore;
+import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore;
import de.lmu.ifi.dbs.elki.database.ids.DBID;
import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
import de.lmu.ifi.dbs.elki.database.query.DistanceResultPair;
import de.lmu.ifi.dbs.elki.database.query.GenericDistanceResultPair;
import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery;
+import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction;
import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance;
import de.lmu.ifi.dbs.elki.logging.Logging;
-import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation;
+import de.lmu.ifi.dbs.elki.math.Mean;
import de.lmu.ifi.dbs.elki.result.ReferencePointsResult;
import de.lmu.ifi.dbs.elki.result.outlier.BasicOutlierScoreMeta;
import de.lmu.ifi.dbs.elki.result.outlier.OutlierResult;
@@ -53,6 +55,7 @@ import de.lmu.ifi.dbs.elki.result.outlier.OutlierScoreMeta; import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
+import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
@@ -144,45 +147,51 @@ public class ReferenceBasedOutlierDetection<V extends NumberVector<?, ?>, D exte DBIDs ids = relation.getDBIDs();
// storage of distance/score values.
- WritableDataStore<Double> rbod_score = DataStoreUtil.makeStorage(ids, DataStoreFactory.HINT_STATIC, Double.class);
- // compute density for one reference point, to initialize the first density
- // value for each object
+ WritableDoubleDataStore rbod_score = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_STATIC | DataStoreFactory.HINT_HOT);
- double density = 0;
- V firstRef = refPoints.iterator().next();
- // compute distance vector for the first reference point
- List<DistanceResultPair<D>> firstReferenceDists = computeDistanceVector(firstRef, relation, distFunc);
- // order ascending
- Collections.sort(firstReferenceDists);
- for(int l = 0; l < firstReferenceDists.size(); l++) {
- density = computeDensity(firstReferenceDists, l);
- rbod_score.put(firstReferenceDists.get(l).getDBID(), density);
- }
- // compute density values for all remaining reference points
- for(V refPoint : refPoints) {
- List<DistanceResultPair<D>> referenceDists = computeDistanceVector(refPoint, relation, distFunc);
- // order ascending
- Collections.sort(referenceDists);
- // compute density value for each object
- for(int l = 0; l < referenceDists.size(); l++) {
- density = computeDensity(referenceDists, l);
- if(density < rbod_score.get(referenceDists.get(l).getDBID())) {
- rbod_score.put(referenceDists.get(l).getDBID(), density);
+ // Compute density estimation:
+ {
+ // compute density for one reference point, to initialize the first
+ // density
+ // value for each object, then update
+ final Iterator<V> iter = refPoints.iterator();
+ if(!iter.hasNext()) {
+ throw new AbortException("Cannot compute ROS without reference points!");
+ }
+ V firstRef = iter.next();
+ // compute distance vector for the first reference point
+ List<DistanceResultPair<D>> firstReferenceDists = computeDistanceVector(firstRef, relation, distFunc);
+ for(int l = 0; l < firstReferenceDists.size(); l++) {
+ double density = computeDensity(firstReferenceDists, l);
+ // Initial value
+ rbod_score.putDouble(firstReferenceDists.get(l).getDBID(), density);
+ }
+ // compute density values for all remaining reference points
+ while(iter.hasNext()) {
+ V refPoint = iter.next();
+ List<DistanceResultPair<D>> referenceDists = computeDistanceVector(refPoint, relation, distFunc);
+ // compute density value for each object
+ for(int l = 0; l < referenceDists.size(); l++) {
+ double density = computeDensity(referenceDists, l);
+ // Update minimum
+ if(density < rbod_score.doubleValue(referenceDists.get(l).getDBID())) {
+ rbod_score.putDouble(referenceDists.get(l).getDBID(), density);
+ }
}
}
}
// compute maximum density
double maxDensity = 0.0;
for(DBID id : relation.iterDBIDs()) {
- double dens = rbod_score.get(id);
+ double dens = rbod_score.doubleValue(id);
if(dens > maxDensity) {
maxDensity = dens;
}
}
- // compute REFOD_SCORE
+ // compute ROS
for(DBID id : relation.iterDBIDs()) {
- double score = 1 - (rbod_score.get(id) / maxDensity);
- rbod_score.put(id, score);
+ double score = 1 - (rbod_score.doubleValue(id) / maxDensity);
+ rbod_score.putDouble(id, score);
}
// adds reference points to the result. header information for the
@@ -207,13 +216,13 @@ public class ReferenceBasedOutlierDetection<V extends NumberVector<?, ?>, D exte * database object and the object id
*/
protected List<DistanceResultPair<D>> computeDistanceVector(V refPoint, Relation<V> database, DistanceQuery<V, D> distFunc) {
+ // TODO: optimize for double distances?
List<DistanceResultPair<D>> referenceDists = new ArrayList<DistanceResultPair<D>>(database.size());
- int counter = 0;
- for(Iterator<DBID> iter = database.iterDBIDs(); iter.hasNext(); counter++) {
- DBID id = iter.next();
- DistanceResultPair<D> referenceDist = new GenericDistanceResultPair<D>(distFunc.distance(id, refPoint), id);
- referenceDists.add(counter, referenceDist);
+ for(DBID id : database.iterDBIDs()) {
+ final D distance = distFunc.distance(id, refPoint);
+ referenceDists.add(new GenericDistanceResultPair<D>(distance, id));
}
+ Collections.sort(referenceDists); // sorted once here; the callers previously sorted the returned list themselves
return referenceDists;
}
@@ -230,51 +239,53 @@ public class ReferenceBasedOutlierDetection<V extends NumberVector<?, ?>, D exte * @return density for one object and reference point
*/
protected double computeDensity(List<DistanceResultPair<D>> referenceDists, int index) {
- double density = 0.0;
- DistanceResultPair<D> x = referenceDists.get(index);
- double xDist = x.getDistance().doubleValue();
+ final DistanceResultPair<D> x = referenceDists.get(index);
+ final double xDist = x.getDistance().doubleValue();
- int j = 0;
- int n = index - 1;
- int m = index + 1;
- while(j < k) {
- double mdist = 0;
- double ndist = 0;
- if(n >= 0) {
- ndist = referenceDists.get(n).getDistance().doubleValue();
- if(m < referenceDists.size()) {
- mdist = referenceDists.get(m).getDistance().doubleValue();
- if(Math.abs(ndist - xDist) < Math.abs(mdist - xDist)) {
- density += Math.abs(ndist - xDist);
- n--;
- j++;
- }
- else {
- density += Math.abs(mdist - xDist);
- m++;
- j++;
- }
+ int lef = index - 1;
+ int rig = index + 1;
+ Mean mean = new Mean();
+ double lef_d = (lef >= 0) ? referenceDists.get(lef).getDistance().doubleValue() : Double.NEGATIVE_INFINITY; // sentinel; lef_d is only read while lef >= 0
+ double rig_d = (rig < referenceDists.size()) ? referenceDists.get(rig).getDistance().doubleValue() : Double.NEGATIVE_INFINITY; // sentinel; rig_d is only read while rig is in range
+ while(mean.getCount() < k) {
+ if(lef >= 0 && rig < referenceDists.size()) {
+ // Both sides available: take whichever is closer to xDist (ties go to the right)
+ if(Math.abs(lef_d - xDist) < Math.abs(rig_d - xDist)) {
+ mean.put(Math.abs(lef_d - xDist));
+ // Update left
+ lef--;
+ lef_d = (lef >= 0) ? referenceDists.get(lef).getDistance().doubleValue() : Double.NEGATIVE_INFINITY;
}
else {
- density += Math.abs(ndist - xDist);
- n--;
- j++;
+ mean.put(Math.abs(rig_d - xDist));
+ // Update right
+ rig++;
+ rig_d = (rig < referenceDists.size()) ? referenceDists.get(rig).getDistance().doubleValue() : Double.NEGATIVE_INFINITY;
}
}
- else if(m < referenceDists.size()) {
- mdist = referenceDists.get(m).getDistance().doubleValue();
- density += Math.abs(mdist - xDist);
- m++;
- j++;
- }
else {
- throw new IndexOutOfBoundsException();
+ if(lef >= 0) {
+ // Choose left, since right is not available.
+ mean.put(Math.abs(lef_d - xDist));
+ // update left
+ lef--;
+ lef_d = (lef >= 0) ? referenceDists.get(lef).getDistance().doubleValue() : Double.NEGATIVE_INFINITY;
+ }
+ else if(rig < referenceDists.size()) {
+ // Choose right, since left is not available
+ mean.put(Math.abs(rig_d - xDist));
+ // Update right
+ rig++;
+ rig_d = (rig < referenceDists.size()) ? referenceDists.get(rig).getDistance().doubleValue() : Double.NEGATIVE_INFINITY;
+ }
+ else {
+ // Neither side has a candidate left before k values were collected
+ throw new IndexOutOfBoundsException();
+ }
}
}
- double densityDegree = 1.0 / ((1.0 / k) * density);
-
- return densityDegree;
+ return 1.0 / mean.getMean(); // inverse of the mean of the distance differences collected above
}
@Override
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/SOD.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/SOD.java index 5e7184a3..a09bbcfd 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/SOD.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/SOD.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.outlier; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -35,16 +35,17 @@ import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; import de.lmu.ifi.dbs.elki.database.datastore.WritableDataStore; +import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; -import de.lmu.ifi.dbs.elki.database.query.DoubleDistanceResultPair; import de.lmu.ifi.dbs.elki.database.query.similarity.SimilarityQuery; import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation; import de.lmu.ifi.dbs.elki.database.relation.Relation; -import de.lmu.ifi.dbs.elki.distance.distancefunction.subspace.DimensionsSelectingEuclideanDistanceFunction; -import de.lmu.ifi.dbs.elki.distance.distancevalue.DoubleDistance; -import de.lmu.ifi.dbs.elki.distance.distancevalue.IntegerDistance; +import de.lmu.ifi.dbs.elki.distance.distancefunction.subspace.SubspaceEuclideanDistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance; import de.lmu.ifi.dbs.elki.distance.similarityfunction.SharedNearestNeighborSimilarityFunction; +import de.lmu.ifi.dbs.elki.distance.similarityfunction.SimilarityFunction; import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress; import 
de.lmu.ifi.dbs.elki.math.DoubleMinMax; @@ -54,10 +55,9 @@ import de.lmu.ifi.dbs.elki.result.outlier.OutlierResult; import de.lmu.ifi.dbs.elki.result.outlier.OutlierScoreMeta; import de.lmu.ifi.dbs.elki.result.textwriter.TextWriteable; import de.lmu.ifi.dbs.elki.result.textwriter.TextWriterStream; -import de.lmu.ifi.dbs.elki.utilities.ClassGenericsUtil; import de.lmu.ifi.dbs.elki.utilities.DatabaseUtil; -import de.lmu.ifi.dbs.elki.utilities.datastructures.heap.KNNHeap; -import de.lmu.ifi.dbs.elki.utilities.datastructures.heap.KNNList; +import de.lmu.ifi.dbs.elki.utilities.datastructures.heap.Heap; +import de.lmu.ifi.dbs.elki.utilities.datastructures.heap.TiedTopBoundedHeap; import de.lmu.ifi.dbs.elki.utilities.documentation.Description; import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import de.lmu.ifi.dbs.elki.utilities.documentation.Title; @@ -69,6 +69,8 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstrain import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; +import de.lmu.ifi.dbs.elki.utilities.pairs.DoubleObjPair; /** * @author Arthur Zimek @@ -82,7 +84,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; @Title("SOD: Subspace outlier degree") @Description("Outlier Detection in Axis-Parallel Subspaces of High Dimensional Data") @Reference(authors = "H.-P. Kriegel, P. Kröger, E. Schubert, A. 
Zimek", title = "Outlier Detection in Axis-Parallel Subspaces of High Dimensional Data", booktitle = "Proceedings of the 13th Pacific-Asia Conference on Knowledge Discovery and Data Mining (PAKDD), Bangkok, Thailand, 2009", url = "http://dx.doi.org/10.1007/978-3-642-01307-2") -public class SOD<V extends NumberVector<V, ?>> extends AbstractAlgorithm<OutlierResult> implements OutlierAlgorithm { +public class SOD<V extends NumberVector<V, ?>, D extends NumberDistance<D, ?>> extends AbstractAlgorithm<OutlierResult> implements OutlierAlgorithm { /** * The logger for this class. */ @@ -93,7 +95,7 @@ public class SOD<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Outlier * considered for learning the subspace properties., must be an integer * greater than 0. */ - public static final OptionID KNN_ID = OptionID.getOrCreateOptionID("sod.knn", "The number of shared nearest neighbors to be considered for learning the subspace properties."); + public static final OptionID KNN_ID = OptionID.getOrCreateOptionID("sod.knn", "The number of most snn-similar objects to use as reference set for learning the subspace properties."); /** * Parameter to indicate the multiplier for the discriminance value for @@ -102,6 +104,11 @@ public class SOD<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Outlier public static final OptionID ALPHA_ID = OptionID.getOrCreateOptionID("sod.alpha", "The multiplier for the discriminance value for discerning small from large variances."); /** + * Parameter for the similarity function. + */ + public static final OptionID SIM_ID = OptionID.getOrCreateOptionID("sod.similarity", "The similarity function used for the neighborhood set."); + + /** * Holds the value of {@link #KNN_ID}. */ private int knn; @@ -112,9 +119,9 @@ public class SOD<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Outlier private double alpha; /** - * The similarity function. + * The similarity function {@link #SIM_ID}. 
*/ - private SharedNearestNeighborSimilarityFunction<V> similarityFunction; + private SimilarityFunction<V, D> similarityFunction; /** * Constructor with parameters. @@ -123,7 +130,7 @@ public class SOD<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Outlier * @param alpha Alpha parameter * @param similarityFunction Shared nearest neighbor similarity function */ - public SOD(int knn, double alpha, SharedNearestNeighborSimilarityFunction<V> similarityFunction) { + public SOD(int knn, double alpha, SimilarityFunction<V, D> similarityFunction) { super(); this.knn = knn; this.alpha = alpha; @@ -136,7 +143,7 @@ public class SOD<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Outlier * @param relation Data relation to process */ public OutlierResult run(Relation<V> relation) throws IllegalStateException { - SimilarityQuery<V, IntegerDistance> snnInstance = similarityFunction.instantiate(relation); + SimilarityQuery<V, D> snnInstance = similarityFunction.instantiate(relation); FiniteProgress progress = logger.isVerbose() ? new FiniteProgress("Assigning Subspace Outlier Degree", relation.size(), logger) : null; WritableDataStore<SODModel<?>> sod_models = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC, SODModel.class); DoubleMinMax minmax = new DoubleMinMax(); @@ -145,7 +152,7 @@ public class SOD<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Outlier if(progress != null) { progress.incrementProcessed(logger); } - DBIDs knnList = getKNN(relation, snnInstance, queryObject).asDBIDs(); + DBIDs knnList = getNearestNeighbors(relation, snnInstance, queryObject); SODModel<V> model = new SODModel<V>(relation, knnList, alpha, relation.get(queryObject)); sod_models.put(queryObject, model); minmax.put(model.getSod()); @@ -168,23 +175,30 @@ public class SOD<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Outlier * <p/> * The query object is excluded from the knn list. 
* - * @param database the database holding the objects - * @param snnInstance similarity function + * @param relation the database holding the objects + * @param simQ similarity function * @param queryObject the query object for which the kNNs should be determined * @return the k nearest neighbors in terms of the shared nearest neighbor * distance without the query object */ - private KNNList<DoubleDistance> getKNN(Relation<V> database, SimilarityQuery<V, IntegerDistance> snnInstance, DBID queryObject) { + private DBIDs getNearestNeighbors(Relation<V> relation, SimilarityQuery<V, D> simQ, DBID queryObject) { // similarityFunction.getPreprocessor().getParameters(); - KNNHeap<DoubleDistance> kNearestNeighbors = new KNNHeap<DoubleDistance>(knn, new DoubleDistance(Double.POSITIVE_INFINITY)); - for(Iterator<DBID> iter = database.iterDBIDs(); iter.hasNext();) { - DBID id = iter.next(); + Heap<DoubleObjPair<DBID>> nearestNeighbors = new TiedTopBoundedHeap<DoubleObjPair<DBID>>(knn); + for(DBID id : relation.iterDBIDs()) { if(!id.equals(queryObject)) { - double distance = 1.0 / snnInstance.similarity(queryObject, id).doubleValue(); - kNearestNeighbors.add(new DoubleDistanceResultPair(distance, id)); + double sim = simQ.similarity(queryObject, id).doubleValue(); + if(sim > 0) { + nearestNeighbors.add(new DoubleObjPair<DBID>(sim, id)); + } } } - return kNearestNeighbors.toKNNList(); + // Collect DBIDs + ArrayModifiableDBIDs dbids = DBIDUtil.newArray(nearestNeighbors.size()); + while(nearestNeighbors.size() > 0) { + final DoubleObjPair<DBID> next = nearestNeighbors.poll(); + dbids.add(next.second); + } + return dbids; } @Override @@ -201,13 +215,13 @@ public class SOD<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Outlier * * * @author Arthur Zimek - * @param <O> the type of DatabaseObjects handled by this Result + * @param <V> the type of DatabaseObjects handled by this Result */ // TODO: arthur comment - public static class SODModel<O extends NumberVector<O, ?>> 
implements TextWriteable, Comparable<SODModel<?>> { + public static class SODModel<V extends NumberVector<V, ?>> implements TextWriteable, Comparable<SODModel<?>> { private double[] centerValues; - private O center; + private V center; private double[] variances; @@ -220,61 +234,71 @@ public class SOD<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Outlier /** * Initialize SOD Model * - * @param database Database + * @param relation Database * @param neighborhood Neighborhood * @param alpha Alpha value * @param queryObject Query object */ - public SODModel(Relation<O> database, DBIDs neighborhood, double alpha, O queryObject) { - // TODO: store database link? - centerValues = new double[DatabaseUtil.dimensionality(database)]; - variances = new double[centerValues.length]; - for(DBID id : neighborhood) { - O databaseObject = database.get(id); - for(int d = 0; d < centerValues.length; d++) { - centerValues[d] += databaseObject.doubleValue(d + 1); + public SODModel(Relation<V> relation, DBIDs neighborhood, double alpha, V queryObject) { + if(neighborhood.size() > 0) { + // TODO: store database link? 
+ centerValues = new double[DatabaseUtil.dimensionality(relation)]; + variances = new double[centerValues.length]; + for(DBID id : neighborhood) { + V databaseObject = relation.get(id); + for(int d = 0; d < centerValues.length; d++) { + centerValues[d] += databaseObject.doubleValue(d + 1); + } } - } - for(int d = 0; d < centerValues.length; d++) { - centerValues[d] /= neighborhood.size(); - } - for(DBID id : neighborhood) { - O databaseObject = database.get(id); for(int d = 0; d < centerValues.length; d++) { - // distance - double distance = centerValues[d] - databaseObject.doubleValue(d + 1); - // variance - variances[d] += distance * distance; + centerValues[d] /= neighborhood.size(); } - } - expectationOfVariance = 0; - for(int d = 0; d < variances.length; d++) { - variances[d] /= neighborhood.size(); - expectationOfVariance += variances[d]; - } - expectationOfVariance /= variances.length; - weightVector = new BitSet(variances.length); - for(int d = 0; d < variances.length; d++) { - if(variances[d] < alpha * expectationOfVariance) { - weightVector.set(d, true); + for(DBID id : neighborhood) { + V databaseObject = relation.get(id); + for(int d = 0; d < centerValues.length; d++) { + // distance + double distance = centerValues[d] - databaseObject.doubleValue(d + 1); + // variance + variances[d] += distance * distance; + } } + expectationOfVariance = 0; + for(int d = 0; d < variances.length; d++) { + variances[d] /= neighborhood.size(); + expectationOfVariance += variances[d]; + } + expectationOfVariance /= variances.length; + weightVector = new BitSet(variances.length); + for(int d = 0; d < variances.length; d++) { + if(variances[d] < alpha * expectationOfVariance) { + weightVector.set(d, true); + } + } + center = DatabaseUtil.assumeVectorField(relation).getFactory().newNumberVector(centerValues); + sod = subspaceOutlierDegree(queryObject, center, weightVector); + } + else { + center = queryObject; + sod = 0.0; } - center = 
DatabaseUtil.assumeVectorField(database).getFactory().newInstance(centerValues); - sod = subspaceOutlierDegree(queryObject, center, weightVector); } /** - * + * Compute SOD score * * @param queryObject * @param center * @param weightVector * @return sod value */ - private double subspaceOutlierDegree(O queryObject, O center, BitSet weightVector) { - final DimensionsSelectingEuclideanDistanceFunction df = new DimensionsSelectingEuclideanDistanceFunction(weightVector); + private double subspaceOutlierDegree(V queryObject, V center, BitSet weightVector) { + final SubspaceEuclideanDistanceFunction df = new SubspaceEuclideanDistanceFunction(weightVector); + final int card = weightVector.cardinality(); + if(card == 0) { + return 0; + } double distance = df.distance(queryObject, center).doubleValue(); - distance /= weightVector.cardinality(); + distance /= card; return distance; } @@ -316,7 +340,7 @@ public class SOD<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Outlier * Model result this is a proxy for. */ Relation<SODModel<?>> models; - + /** * The IDs we are defined for */ @@ -326,7 +350,7 @@ public class SOD<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Outlier * Constructor. 
* * @param models Models result - * @param dbids IDs we are defined for + * @param dbids IDs we are defined for */ public SODProxyScoreResult(Relation<SODModel<?>> models, DBIDs dbids) { super(); @@ -353,7 +377,7 @@ public class SOD<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Outlier public DBIDs getDBIDs() { return dbids; } - + @Override public IterableIterator<DBID> iterDBIDs() { return IterableUtil.fromIterator(dbids.iterator()); @@ -402,7 +426,7 @@ public class SOD<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Outlier * * @apiviz.exclude */ - public static class Parameterizer<V extends NumberVector<V, ?>> extends AbstractParameterizer { + public static class Parameterizer<V extends NumberVector<V, ?>, D extends NumberDistance<D, ?>> extends AbstractParameterizer { /** * Holds the value of {@link #KNN_ID}. */ @@ -414,14 +438,19 @@ public class SOD<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Outlier private double alpha = 1.1; /** - * The similarity function. + * The similarity function - {@link #SIM_ID}. 
*/ - private SharedNearestNeighborSimilarityFunction<V> similarityFunction; + private SimilarityFunction<V, D> similarityFunction; @Override protected void makeOptions(Parameterization config) { super.makeOptions(config); - final IntParameter knnP = new IntParameter(KNN_ID, new GreaterConstraint(0), 1); + final ObjectParameter<SimilarityFunction<V, D>> simP = new ObjectParameter<SimilarityFunction<V, D>>(SIM_ID, SimilarityFunction.class, SharedNearestNeighborSimilarityFunction.class); + if(config.grab(simP)) { + similarityFunction = simP.instantiateClass(config); + } + + final IntParameter knnP = new IntParameter(KNN_ID, new GreaterConstraint(0)); if(config.grab(knnP)) { knn = knnP.getValue(); } @@ -430,14 +459,11 @@ public class SOD<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Outlier if(config.grab(alphaP)) { alpha = alphaP.getValue(); } - - Class<SharedNearestNeighborSimilarityFunction<V>> cls = ClassGenericsUtil.uglyCastIntoSubclass(SharedNearestNeighborSimilarityFunction.class); - similarityFunction = config.tryInstantiate(cls); } @Override - protected SOD<V> makeInstance() { - return new SOD<V>(knn, alpha, similarityFunction); + protected SOD<V, D> makeInstance() { + return new SOD<V, D>(knn, alpha, similarityFunction); } } }
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/meta/ExternalDoubleOutlierScore.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/meta/ExternalDoubleOutlierScore.java index 5a4503fd..22447454 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/meta/ExternalDoubleOutlierScore.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/meta/ExternalDoubleOutlierScore.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.outlier.meta; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -39,14 +39,14 @@ import de.lmu.ifi.dbs.elki.data.type.TypeUtil; import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; -import de.lmu.ifi.dbs.elki.database.datastore.WritableDataStore; +import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore; import de.lmu.ifi.dbs.elki.database.ids.DBID; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.datasource.parser.AbstractParser; import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.math.DoubleMinMax; -import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation; import de.lmu.ifi.dbs.elki.result.outlier.BasicOutlierScoreMeta; import de.lmu.ifi.dbs.elki.result.outlier.InvertedOutlierScoreMeta; import de.lmu.ifi.dbs.elki.result.outlier.OutlierResult; @@ -140,7 +140,7 @@ public class ExternalDoubleOutlierScore extends AbstractAlgorithm<OutlierResult> * @return Result */ public OutlierResult run(Database database, Relation<?> relation) { - WritableDataStore<Double> scores = 
DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC, Double.class); + WritableDoubleDataStore scores = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC); Pattern colSep = Pattern.compile(AbstractParser.WHITESPACE_PATTERN); DoubleMinMax minmax = new DoubleMinMax(); @@ -156,7 +156,7 @@ public class ExternalDoubleOutlierScore extends AbstractAlgorithm<OutlierResult> else if(line.length() > 0) { String[] cols = colSep.split(line); Integer id = null; - Double score = null; + double score = Double.NaN; for(String str : cols) { Matcher mi = idpattern.matcher(str); Matcher ms = scorepattern.matcher(str); @@ -172,17 +172,17 @@ public class ExternalDoubleOutlierScore extends AbstractAlgorithm<OutlierResult> id = Integer.parseInt(str.substring(mi.end())); } if(msf) { - if(score != null) { + if(!Double.isNaN(score)) { throw new AbortException("Score pattern matched twice: previous value " + score + " second value: " + str); } score = Double.parseDouble(str.substring(ms.end())); } } - if(id != null && score != null) { - scores.put(DBIDUtil.importInteger(id), score); + if(id != null && !Double.isNaN(score)) { + scores.putDouble(DBIDUtil.importInteger(id), score); minmax.put(score); } - else if(id == null && score == null) { + else if(id == null && Double.isNaN(score)) { logger.warning("Line did not match either ID nor score nor comment: " + line); } else { @@ -213,7 +213,7 @@ public class ExternalDoubleOutlierScore extends AbstractAlgorithm<OutlierResult> for(DBID id : relation.iterDBIDs()) { double val = scoresult.get(id); // scores.get(id); val = scaling.getScaled(val); - scores.put(id, val); + scores.putDouble(id, val); mm.put(val); } meta = new BasicOutlierScoreMeta(mm.getMin(), mm.getMax()); diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/meta/FeatureBagging.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/meta/FeatureBagging.java index d3e738a1..c8da9501 100644 --- 
a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/meta/FeatureBagging.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/meta/FeatureBagging.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.outlier.meta; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -25,7 +25,6 @@ package de.lmu.ifi.dbs.elki.algorithm.outlier.meta; import java.util.ArrayList; import java.util.BitSet; -import java.util.HashMap; import java.util.Random; import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm; @@ -36,11 +35,11 @@ import de.lmu.ifi.dbs.elki.data.type.TypeInformation; import de.lmu.ifi.dbs.elki.data.type.TypeUtil; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; -import de.lmu.ifi.dbs.elki.database.datastore.WritableDataStore; +import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore; import de.lmu.ifi.dbs.elki.database.ids.DBID; import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation; import de.lmu.ifi.dbs.elki.database.relation.Relation; -import de.lmu.ifi.dbs.elki.distance.distancefunction.subspace.DimensionsSelectingEuclideanDistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancefunction.subspace.SubspaceEuclideanDistanceFunction; import de.lmu.ifi.dbs.elki.distance.distancevalue.DoubleDistance; import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress; @@ -60,6 +59,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameteriz import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.Flag; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.LongParameter; +import de.lmu.ifi.dbs.elki.utilities.pairs.Pair; /** 
* A simple ensemble method called "Feature bagging" for outlier detection. @@ -144,8 +144,8 @@ public class FeatureBagging extends AbstractAlgorithm<OutlierResult> implements FiniteProgress prog = logger.isVerbose() ? new FiniteProgress("LOF iterations", num, logger) : null; for(int i = 0; i < num; i++) { BitSet dimset = randomSubspace(dbdim, mindim, maxdim); - DimensionsSelectingEuclideanDistanceFunction df = new DimensionsSelectingEuclideanDistanceFunction(dimset); - LOF<NumberVector<?, ?>, DoubleDistance> lof = new LOF<NumberVector<?, ?>, DoubleDistance>(k, df, df); + SubspaceEuclideanDistanceFunction df = new SubspaceEuclideanDistanceFunction(dimset); + LOF<NumberVector<?, ?>, DoubleDistance> lof = new LOF<NumberVector<?, ?>, DoubleDistance>(k, df); // run LOF and collect the result OutlierResult result = lof.run(relation); @@ -159,28 +159,34 @@ public class FeatureBagging extends AbstractAlgorithm<OutlierResult> implements } } - WritableDataStore<Double> scores = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC, Double.class); + WritableDoubleDataStore scores = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC); DoubleMinMax minmax = new DoubleMinMax(); if(breadth) { FiniteProgress cprog = logger.isVerbose() ? new FiniteProgress("Combining results", relation.size(), logger) : null; - HashMap<IterableIterator<DBID>, Relation<Double>> IDVectorOntoScoreVector = new HashMap<IterableIterator<DBID>, Relation<Double>>(); + Pair<IterableIterator<DBID>, Relation<Double>>[] IDVectorOntoScoreVector = Pair.newPairArray(results.size()); // Mapping score-sorted DBID-Iterators onto their corresponding scores. // We need to initialize them now be able to iterate them "in parallel". 
- for(OutlierResult r : results) { - IDVectorOntoScoreVector.put(r.getOrdering().iter(relation.getDBIDs()), r.getScores()); + { + int i = 0; + for(OutlierResult r : results) { + IDVectorOntoScoreVector[i] = new Pair<IterableIterator<DBID>, Relation<Double>>(r.getOrdering().iter(relation.getDBIDs()), r.getScores()); + i++; + } } // Iterating over the *lines* of the AS_t(i)-matrix. for(int i = 0; i < relation.size(); i++) { // Iterating over the elements of a line (breadth-first). - for(IterableIterator<DBID> iter : IDVectorOntoScoreVector.keySet()) { - if(iter.hasNext()) { // Always true if every algorithm returns a - // complete result (one score for every DBID). + for(Pair<IterableIterator<DBID>, Relation<Double>> pair : IDVectorOntoScoreVector) { + IterableIterator<DBID> iter = pair.first; + // Always true if every algorithm returns a complete result (one score + // for every DBID). + if(iter.hasNext()) { DBID tmpID = iter.next(); - double score = IDVectorOntoScoreVector.get(iter).get(tmpID); - if(scores.get(tmpID) == null) { - scores.put(tmpID, score); + double score = pair.second.get(tmpID); + if(Double.isNaN(scores.doubleValue(tmpID))) { + scores.putDouble(tmpID, score); minmax.put(score); } } @@ -202,9 +208,12 @@ public class FeatureBagging extends AbstractAlgorithm<OutlierResult> implements for(DBID id : relation.iterDBIDs()) { double sum = 0.0; for(OutlierResult r : results) { - sum += r.getScores().get(id); + final Double s = r.getScores().get(id); + if (s != null && !Double.isNaN(s)) { + sum += s; + } } - scores.put(id, sum); + scores.putDouble(id, sum); minmax.put(sum); if(cprog != null) { cprog.incrementProcessed(logger); diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/meta/RescaleMetaOutlierAlgorithm.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/meta/RescaleMetaOutlierAlgorithm.java index 18f62549..9634cd59 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/meta/RescaleMetaOutlierAlgorithm.java +++ 
b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/meta/RescaleMetaOutlierAlgorithm.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.outlier.meta; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -33,12 +33,12 @@ import de.lmu.ifi.dbs.elki.data.type.TypeUtil; import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; -import de.lmu.ifi.dbs.elki.database.datastore.WritableDataStore; +import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore; import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.math.DoubleMinMax; -import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation; import de.lmu.ifi.dbs.elki.result.Result; import de.lmu.ifi.dbs.elki.result.ResultUtil; import de.lmu.ifi.dbs.elki.result.outlier.BasicOutlierScoreMeta; @@ -102,13 +102,13 @@ public class RescaleMetaOutlierAlgorithm extends AbstractAlgorithm<OutlierResult ((OutlierScalingFunction) scaling).prepare(or); } - WritableDataStore<Double> scaledscores = DataStoreUtil.makeStorage(scores.getDBIDs(), DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_STATIC, Double.class); + WritableDoubleDataStore scaledscores = DataStoreUtil.makeDoubleStorage(scores.getDBIDs(), DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_STATIC); DoubleMinMax minmax = new DoubleMinMax(); for(DBID id : scores.iterDBIDs()) { double val = scores.get(id); val = scaling.getScaled(val); - scaledscores.put(id, val); + scaledscores.putDouble(id, val); minmax.put(val); } diff --git 
a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/meta/package-info.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/meta/package-info.java index 6565e144..d7e78281 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/meta/package-info.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/meta/package-info.java @@ -5,7 +5,7 @@ This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures -Copyright (C) 2011 +Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/package-info.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/package-info.java index 372383f8..ea5d3ec4 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/package-info.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/package-info.java @@ -9,7 +9,7 @@ This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures -Copyright (C) 2011 +Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/AbstractDistanceBasedSpatialOutlier.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/AbstractDistanceBasedSpatialOutlier.java index 79fae09d..1caf7582 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/AbstractDistanceBasedSpatialOutlier.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/AbstractDistanceBasedSpatialOutlier.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.outlier.spatial; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/AbstractNeighborhoodOutlier.java 
b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/AbstractNeighborhoodOutlier.java index 4e1e0fa1..f0c05e1e 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/AbstractNeighborhoodOutlier.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/AbstractNeighborhoodOutlier.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.outlier.spatial; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/CTLuGLSBackwardSearchAlgorithm.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/CTLuGLSBackwardSearchAlgorithm.java index e8a7415b..b4070e0c 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/CTLuGLSBackwardSearchAlgorithm.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/CTLuGLSBackwardSearchAlgorithm.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.outlier.spatial; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -23,9 +23,6 @@ package de.lmu.ifi.dbs.elki.algorithm.outlier.spatial; along with this program. If not, see <http://www.gnu.org/licenses/>. 
*/ -import java.util.Collections; -import java.util.List; - import de.lmu.ifi.dbs.elki.algorithm.AbstractDistanceBasedAlgorithm; import de.lmu.ifi.dbs.elki.algorithm.outlier.OutlierAlgorithm; import de.lmu.ifi.dbs.elki.data.NumberVector; @@ -34,13 +31,14 @@ import de.lmu.ifi.dbs.elki.data.type.TypeUtil; import de.lmu.ifi.dbs.elki.database.QueryUtil; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; -import de.lmu.ifi.dbs.elki.database.datastore.WritableDataStore; -import de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs; +import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore; +import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.ids.DBID; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.query.DistanceResultPair; import de.lmu.ifi.dbs.elki.database.query.knn.KNNQuery; +import de.lmu.ifi.dbs.elki.database.query.knn.KNNResult; import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation; import de.lmu.ifi.dbs.elki.database.relation.ProxyView; import de.lmu.ifi.dbs.elki.database.relation.Relation; @@ -48,8 +46,8 @@ import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction; import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance; import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.math.DoubleMinMax; -import de.lmu.ifi.dbs.elki.math.MathUtil; import de.lmu.ifi.dbs.elki.math.linearalgebra.Matrix; +import de.lmu.ifi.dbs.elki.math.statistics.distribution.NormalDistribution; import de.lmu.ifi.dbs.elki.result.outlier.BasicOutlierScoreMeta; import de.lmu.ifi.dbs.elki.result.outlier.OutlierResult; import de.lmu.ifi.dbs.elki.result.outlier.OutlierScoreMeta; @@ -123,7 +121,7 @@ public class CTLuGLSBackwardSearchAlgorithm<V extends NumberVector<?, ?>, D exte * @return Algorithm result */ public OutlierResult 
run(Relation<V> relationx, Relation<? extends NumberVector<?, ?>> relationy) { - WritableDataStore<Double> scores = DataStoreUtil.makeStorage(relationx.getDBIDs(), DataStoreFactory.HINT_STATIC, Double.class); + WritableDoubleDataStore scores = DataStoreUtil.makeDoubleStorage(relationx.getDBIDs(), DataStoreFactory.HINT_STATIC); DoubleMinMax mm = new DoubleMinMax(0.0, 0.0); // Outlier detection loop @@ -131,14 +129,14 @@ public class CTLuGLSBackwardSearchAlgorithm<V extends NumberVector<?, ?>, D exte ModifiableDBIDs idview = DBIDUtil.newHashSet(relationx.getDBIDs()); ProxyView<V> proxy = new ProxyView<V>(relationx.getDatabase(), idview, relationx); - double phialpha = MathUtil.standardNormalProbit(1.0 - alpha / 2); + double phialpha = NormalDistribution.standardNormalProbit(1.0 - alpha / 2); // Detect outliers while significant. while(true) { Pair<DBID, Double> candidate = singleIteration(proxy, relationy); if(candidate.second < phialpha) { break; } - scores.put(candidate.first, candidate.second); + scores.putDouble(candidate.first, candidate.second); if (!Double.isNaN(candidate.second)) { mm.put(candidate.second); } @@ -147,7 +145,7 @@ public class CTLuGLSBackwardSearchAlgorithm<V extends NumberVector<?, ?>, D exte // Remaining objects are inliers for(DBID id : idview) { - scores.put(id, 0.0); + scores.putDouble(id, 0.0); } } @@ -170,9 +168,9 @@ public class CTLuGLSBackwardSearchAlgorithm<V extends NumberVector<?, ?>, D exte KNNQuery<V, D> knnQuery = QueryUtil.getKNNQuery(relationx, getDistanceFunction(), k + 1); // We need stable indexed DBIDs - ArrayDBIDs ids = DBIDUtil.newArray(relationx.getDBIDs()); + ArrayModifiableDBIDs ids = DBIDUtil.newArray(relationx.getDBIDs()); // Sort, so we can do a binary search below. 
- Collections.sort(ids); + ids.sort(); // init F,X,Z Matrix X = new Matrix(ids.size(), 6); @@ -203,7 +201,7 @@ public class CTLuGLSBackwardSearchAlgorithm<V extends NumberVector<?, ?>, D exte // Fill the neighborhood matrix F: { - List<DistanceResultPair<D>> neighbors = knnQuery.getKNNForDBID(id, k + 1); + KNNResult<D> neighbors = knnQuery.getKNNForDBID(id, k + 1); ModifiableDBIDs neighborhood = DBIDUtil.newArray(neighbors.size()); for(DistanceResultPair<D> dpair : neighbors) { if(id.equals(dpair.getDBID())) { @@ -216,7 +214,7 @@ public class CTLuGLSBackwardSearchAlgorithm<V extends NumberVector<?, ?>, D exte final int nweight = -1 / neighborhood.size(); // We need to find the index positions of the neighbors, unfortunately. for(DBID nid : neighborhood) { - int pos = Collections.binarySearch(ids, nid); + int pos = ids.binarySearch(nid); assert (pos >= 0); F.set(pos, i, nweight); } @@ -239,7 +237,7 @@ public class CTLuGLSBackwardSearchAlgorithm<V extends NumberVector<?, ?>, D exte double worstscore = Double.NEGATIVE_INFINITY; for(int i = 0; i < ids.size(); i++) { DBID id = ids.get(i); - double err = E.getRowVector(i).euclideanLength(); + double err = E.getRow(i).euclideanLength(); // double err = Math.abs(E.get(i, 0)); if(err > worstscore) { worstscore = err; diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/CTLuMeanMultipleAttributes.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/CTLuMeanMultipleAttributes.java index 516c2ade..68e58ffa 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/CTLuMeanMultipleAttributes.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/CTLuMeanMultipleAttributes.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.outlier.spatial; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -30,6 
+30,7 @@ import de.lmu.ifi.dbs.elki.data.type.TypeUtil; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; import de.lmu.ifi.dbs.elki.database.datastore.WritableDataStore; +import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore; import de.lmu.ifi.dbs.elki.database.ids.DBID; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation; @@ -60,7 +61,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; * <p> * Implementation note: attribute standardization is not used; this is * equivalent to using the - * {@link de.lmu.ifi.dbs.elki.datasource.filter.AttributeWiseVarianceNormalization + * {@link de.lmu.ifi.dbs.elki.datasource.filter.normalization.AttributeWiseVarianceNormalization * AttributeWiseVarianceNormalization} filter. * </p> * @@ -115,14 +116,12 @@ public class CTLuMeanMultipleAttributes<N, O extends NumberVector<?, ?>> extends Matrix cmati = covmaker.destroyToSampleMatrix().inverse(); DoubleMinMax minmax = new DoubleMinMax(); - WritableDataStore<Double> scores = DataStoreUtil.makeStorage(attributes.getDBIDs(), DataStoreFactory.HINT_STATIC, Double.class); + WritableDoubleDataStore scores = DataStoreUtil.makeDoubleStorage(attributes.getDBIDs(), DataStoreFactory.HINT_STATIC); for(DBID id : attributes.iterDBIDs()) { Vector temp = deltas.get(id).minus(mean); - final Vector res = temp.transposeTimes(cmati).times(temp); - assert (res.getDimensionality() == 1); - double score = res.get(0); + final double score = temp.transposeTimesTimes(cmati, temp); minmax.put(score); - scores.put(id, score); + scores.putDouble(id, score); } Relation<Double> scoreResult = new MaterializedRelation<Double>("mean multiple attributes spatial outlier", "mean-multipleattributes-outlier", TypeUtil.DOUBLE, scores, attributes.getDBIDs()); diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/CTLuMedianAlgorithm.java 
b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/CTLuMedianAlgorithm.java index 7f397790..9b4534fe 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/CTLuMedianAlgorithm.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/CTLuMedianAlgorithm.java @@ -3,7 +3,7 @@ package de.lmu.ifi.dbs.elki.algorithm.outlier.spatial; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures -Copyright (C) 2011 +Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -29,7 +29,7 @@ import de.lmu.ifi.dbs.elki.data.type.TypeUtil; import de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation;
import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory;
import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil;
-import de.lmu.ifi.dbs.elki.database.datastore.WritableDataStore;
+import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore;
import de.lmu.ifi.dbs.elki.database.ids.DBID;
import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation;
@@ -37,10 +37,10 @@ import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.math.DoubleMinMax;
import de.lmu.ifi.dbs.elki.math.MeanVariance;
-import de.lmu.ifi.dbs.elki.math.statistics.QuickSelect;
import de.lmu.ifi.dbs.elki.result.outlier.BasicOutlierScoreMeta;
import de.lmu.ifi.dbs.elki.result.outlier.OutlierResult;
import de.lmu.ifi.dbs.elki.result.outlier.OutlierScoreMeta;
+import de.lmu.ifi.dbs.elki.utilities.datastructures.QuickSelect;
import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
@@ -91,7 +91,7 @@ public class CTLuMedianAlgorithm<N> extends AbstractNeighborhoodOutlier<N> { */
public OutlierResult run(Relation<N> nrel, Relation<? extends NumberVector<?, ?>> relation) {
final NeighborSetPredicate npred = getNeighborSetPredicateFactory().instantiate(nrel);
- WritableDataStore<Double> scores = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC, Double.class);
+ WritableDoubleDataStore scores = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC);
MeanVariance mv = new MeanVariance();
for(DBID id : relation.iterDBIDs()) {
@@ -110,15 +110,14 @@ public class CTLuMedianAlgorithm<N> extends AbstractNeighborhoodOutlier<N> { }
if(c > 0) {
- // Note: only use up to c-1, since we may have used a too big array
- median = QuickSelect.median(fi, 0, c - 1);
+ median = QuickSelect.median(fi, 0, c);
}
else {
median = relation.get(id).doubleValue(1);
}
}
double h = relation.get(id).doubleValue(1) - median;
- scores.put(id, h);
+ scores.putDouble(id, h);
mv.put(h);
}
@@ -127,9 +126,9 @@ public class CTLuMedianAlgorithm<N> extends AbstractNeighborhoodOutlier<N> { final double stddev = mv.getNaiveStddev();
DoubleMinMax minmax = new DoubleMinMax();
for(DBID id : relation.iterDBIDs()) {
- double score = Math.abs((scores.get(id) - mean) / stddev);
+ double score = Math.abs((scores.doubleValue(id) - mean) / stddev);
minmax.put(score);
- scores.put(id, score);
+ scores.putDouble(id, score);
}
Relation<Double> scoreResult = new MaterializedRelation<Double>("MO", "Median-outlier", TypeUtil.DOUBLE, scores, relation.getDBIDs());
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/CTLuMedianMultipleAttributes.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/CTLuMedianMultipleAttributes.java index 649511eb..cbf61c38 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/CTLuMedianMultipleAttributes.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/CTLuMedianMultipleAttributes.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.outlier.spatial; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -30,6 +30,7 @@ import de.lmu.ifi.dbs.elki.data.type.TypeUtil; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; import de.lmu.ifi.dbs.elki.database.datastore.WritableDataStore; +import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore; import de.lmu.ifi.dbs.elki.database.ids.DBID; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation; @@ -39,11 +40,11 @@ import de.lmu.ifi.dbs.elki.math.DoubleMinMax; import de.lmu.ifi.dbs.elki.math.linearalgebra.CovarianceMatrix; import de.lmu.ifi.dbs.elki.math.linearalgebra.Matrix; import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; -import de.lmu.ifi.dbs.elki.math.statistics.QuickSelect; import de.lmu.ifi.dbs.elki.result.outlier.BasicOutlierScoreMeta; import de.lmu.ifi.dbs.elki.result.outlier.OutlierResult; import de.lmu.ifi.dbs.elki.result.outlier.OutlierScoreMeta; import de.lmu.ifi.dbs.elki.utilities.DatabaseUtil; +import de.lmu.ifi.dbs.elki.utilities.datastructures.QuickSelect; import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; /** @@ -61,7 +62,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; * <p> * Implementation note: attribute 
standardization is not used; this is * equivalent to using the - * {@link de.lmu.ifi.dbs.elki.datasource.filter.AttributeWiseVarianceNormalization + * {@link de.lmu.ifi.dbs.elki.datasource.filter.normalization.AttributeWiseVarianceNormalization * AttributeWiseVarianceNormalization} filter. * </p> * @@ -141,14 +142,12 @@ public class CTLuMedianMultipleAttributes<N, O extends NumberVector<?, ?>> exten Matrix cmati = covmaker.destroyToSampleMatrix().inverse(); DoubleMinMax minmax = new DoubleMinMax(); - WritableDataStore<Double> scores = DataStoreUtil.makeStorage(attributes.getDBIDs(), DataStoreFactory.HINT_STATIC, Double.class); + WritableDoubleDataStore scores = DataStoreUtil.makeDoubleStorage(attributes.getDBIDs(), DataStoreFactory.HINT_STATIC); for(DBID id : attributes.iterDBIDs()) { Vector temp = deltas.get(id).minus(mean); - final Vector res = temp.transposeTimes(cmati).times(temp); - assert (res.getDimensionality() == 1); - double score = res.get(0); + final double score = temp.transposeTimesTimes(cmati, temp); minmax.put(score); - scores.put(id, score); + scores.putDouble(id, score); } Relation<Double> scoreResult = new MaterializedRelation<Double>("Median multiple attributes outlier", "median-outlier", TypeUtil.DOUBLE, scores, attributes.getDBIDs()); diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/CTLuMoranScatterplotOutlier.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/CTLuMoranScatterplotOutlier.java index 6780fcc9..9f19757d 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/CTLuMoranScatterplotOutlier.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/CTLuMoranScatterplotOutlier.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.outlier.spatial; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ 
-31,7 +31,7 @@ import de.lmu.ifi.dbs.elki.data.type.TypeUtil; import de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; -import de.lmu.ifi.dbs.elki.database.datastore.WritableDataStore; +import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore; import de.lmu.ifi.dbs.elki.database.ids.DBID; import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation; import de.lmu.ifi.dbs.elki.database.relation.Relation; @@ -103,7 +103,7 @@ public class CTLuMoranScatterplotOutlier<N> extends AbstractNeighborhoodOutlier< } DoubleMinMax minmax = new DoubleMinMax(); - WritableDataStore<Double> scores = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC, Double.class); + WritableDoubleDataStore scores = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC); // calculate normalized attribute values // calculate neighborhood average of normalized attribute values. @@ -132,7 +132,7 @@ public class CTLuMoranScatterplotOutlier<N> extends AbstractNeighborhoodOutlier< // Note: in the original moran scatterplot, any object with a score < 0 would be an outlier. 
final double score = Math.max(-globalZ * localZ, 0); minmax.put(score); - scores.put(id, score); + scores.putDouble(id, score); } Relation<Double> scoreResult = new MaterializedRelation<Double>("MoranOutlier", "Moran Scatterplot Outlier", TypeUtil.DOUBLE, scores, relation.getDBIDs()); diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/CTLuRandomWalkEC.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/CTLuRandomWalkEC.java index 0fe65fee..a6425d43 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/CTLuRandomWalkEC.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/CTLuRandomWalkEC.java @@ -3,7 +3,7 @@ package de.lmu.ifi.dbs.elki.algorithm.outlier.spatial; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures -Copyright (C) 2011 +Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -31,6 +31,7 @@ import de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory;
import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil;
import de.lmu.ifi.dbs.elki.database.datastore.WritableDataStore;
+import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore;
import de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs;
import de.lmu.ifi.dbs.elki.database.ids.DBID;
import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
@@ -196,13 +197,13 @@ public class CTLuRandomWalkEC<N, D extends NumberDistance<D, ?>> extends Abstrac for(int i = 0; i < ids.size(); i++) {
DBID id = ids.get(i);
// Note: matrix times ith unit vector = ith column
- Vector sim = E.getColumnVector(i);
+ Vector sim = E.getCol(i);
similarityVectors.put(id, sim);
}
E = null;
// compute the relevance scores between specified Object and its neighbors
DoubleMinMax minmax = new DoubleMinMax();
- WritableDataStore<Double> scores = DataStoreUtil.makeStorage(spatial.getDBIDs(), DataStoreFactory.HINT_STATIC, Double.class);
+ WritableDoubleDataStore scores = DataStoreUtil.makeDoubleStorage(spatial.getDBIDs(), DataStoreFactory.HINT_STATIC);
for(int i = 0; i < ids.size(); i++) {
DBID id = ids.get(i);
double gmean = 1.0;
@@ -211,13 +212,13 @@ public class CTLuRandomWalkEC<N, D extends NumberDistance<D, ?>> extends Abstrac if(id.equals(n)) {
continue;
}
- double sim = MathUtil.cosineSimilarity(similarityVectors.get(id), similarityVectors.get(n));
+ double sim = MathUtil.angle(similarityVectors.get(id), similarityVectors.get(n));
gmean *= sim;
cnt++;
}
final double score = Math.pow(gmean, 1.0 / cnt);
minmax.put(score);
- scores.put(id, score);
+ scores.putDouble(id, score);
}
Relation<Double> scoreResult = new MaterializedRelation<Double>("randomwalkec", "RandomWalkEC", TypeUtil.DOUBLE, scores, relation.getDBIDs());
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/CTLuScatterplotOutlier.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/CTLuScatterplotOutlier.java index d18e4130..8e4ab32c 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/CTLuScatterplotOutlier.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/CTLuScatterplotOutlier.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.outlier.spatial; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -30,7 +30,7 @@ import de.lmu.ifi.dbs.elki.data.type.TypeUtil; import de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; -import de.lmu.ifi.dbs.elki.database.datastore.WritableDataStore; +import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore; import de.lmu.ifi.dbs.elki.database.ids.DBID; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation; @@ -97,7 +97,7 @@ public class CTLuScatterplotOutlier<N> extends AbstractNeighborhoodOutlier<N> { */ public OutlierResult run(Relation<N> nrel, Relation<? 
extends NumberVector<?, ?>> relation) { final NeighborSetPredicate npred = getNeighborSetPredicateFactory().instantiate(nrel); - WritableDataStore<Double> means = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP, Double.class); + WritableDoubleDataStore means = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP); // Calculate average of neighborhood for each object and perform a linear // regression using the covariance matrix @@ -122,7 +122,7 @@ public class CTLuScatterplotOutlier<N> extends AbstractNeighborhoodOutlier<N> { m = local; } // Store the mean for the score calculation - means.put(id, m); + means.putDouble(id, m); covm.put(new double[] { local, m }); } // Finalize covariance matrix, compute linear regression @@ -137,13 +137,13 @@ public class CTLuScatterplotOutlier<N> extends AbstractNeighborhoodOutlier<N> { } // calculate mean and variance for error - WritableDataStore<Double> scores = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC, Double.class); + WritableDoubleDataStore scores = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC); MeanVariance mv = new MeanVariance(); for(DBID id : relation.iterDBIDs()) { // Compute the error from the linear regression double y_i = relation.get(id).doubleValue(1); - double e = means.get(id) - (slope * y_i + inter); - scores.put(id, e); + double e = means.doubleValue(id) - (slope * y_i + inter); + scores.putDouble(id, e); mv.put(e); } @@ -153,9 +153,9 @@ public class CTLuScatterplotOutlier<N> extends AbstractNeighborhoodOutlier<N> { final double mean = mv.getMean(); final double variance = mv.getNaiveStddev(); for(DBID id : relation.iterDBIDs()) { - double score = Math.abs((scores.get(id) - mean) / variance); + double score = Math.abs((scores.doubleValue(id) - mean) / variance); minmax.put(score); - scores.put(id, score); + scores.putDouble(id, score); } } // build representation diff --git 
a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/CTLuZTestOutlier.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/CTLuZTestOutlier.java index 304203db..573e1526 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/CTLuZTestOutlier.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/CTLuZTestOutlier.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.outlier.spatial; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -31,7 +31,7 @@ import de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation; import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; -import de.lmu.ifi.dbs.elki.database.datastore.WritableDataStore; +import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore; import de.lmu.ifi.dbs.elki.database.ids.DBID; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation; @@ -99,7 +99,7 @@ public class CTLuZTestOutlier<N> extends AbstractNeighborhoodOutlier<N> { */ public OutlierResult run(Database database, Relation<N> nrel, Relation<? 
extends NumberVector<?, ?>> relation) { final NeighborSetPredicate npred = getNeighborSetPredicateFactory().instantiate(nrel); - WritableDataStore<Double> scores = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC, Double.class); + WritableDoubleDataStore scores = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC); MeanVariance zmv = new MeanVariance(); for(DBID id : relation.iterDBIDs()) { @@ -121,16 +121,16 @@ public class CTLuZTestOutlier<N> extends AbstractNeighborhoodOutlier<N> { else { localdiff = 0.0; } - scores.put(id, localdiff); + scores.putDouble(id, localdiff); zmv.put(localdiff); } // Normalize scores using mean and variance DoubleMinMax minmax = new DoubleMinMax(); for(DBID id : relation.iterDBIDs()) { - double score = Math.abs(scores.get(id) - zmv.getMean()) / zmv.getSampleStddev(); + double score = Math.abs(scores.doubleValue(id) - zmv.getMean()) / zmv.getSampleStddev(); minmax.put(score); - scores.put(id, score); + scores.putDouble(id, score); } // Wrap result diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/SLOM.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/SLOM.java index af8762e0..e69d46d4 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/SLOM.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/SLOM.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.outlier.spatial; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -29,16 +29,16 @@ import de.lmu.ifi.dbs.elki.data.type.TypeUtil; import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; -import de.lmu.ifi.dbs.elki.database.datastore.WritableDataStore; +import 
de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore; import de.lmu.ifi.dbs.elki.database.ids.DBID; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; +import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance; import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.math.DoubleMinMax; -import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation; import de.lmu.ifi.dbs.elki.result.outlier.BasicOutlierScoreMeta; import de.lmu.ifi.dbs.elki.result.outlier.OutlierResult; import de.lmu.ifi.dbs.elki.result.outlier.OutlierScoreMeta; @@ -96,7 +96,7 @@ public class SLOM<N, O, D extends NumberDistance<D, ?>> extends AbstractDistance final NeighborSetPredicate npred = getNeighborSetPredicateFactory().instantiate(spatial); DistanceQuery<O, D> distFunc = getNonSpatialDistanceFunction().instantiate(relation); - WritableDataStore<Double> modifiedDistance = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP, Double.class); + WritableDoubleDataStore modifiedDistance = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP); // calculate D-Tilde for(DBID id : relation.iterDBIDs()) { double sum = 0; @@ -114,18 +114,18 @@ public class SLOM<N, O, D extends NumberDistance<D, ?>> extends AbstractDistance maxDist = Math.max(maxDist, dist); } if(cnt > 1) { - modifiedDistance.put(id, ((sum - maxDist) / (cnt - 1))); + modifiedDistance.putDouble(id, ((sum - maxDist) / (cnt - 1))); } else { // Use regular distance when the d-tilde trick is undefined. // Note: this can be 0 when there were no neighbors. 
- modifiedDistance.put(id, maxDist); + modifiedDistance.putDouble(id, maxDist); } } // Second step - compute actual SLOM values DoubleMinMax slomminmax = new DoubleMinMax(); - WritableDataStore<Double> sloms = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC, Double.class); + WritableDoubleDataStore sloms = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC); for(DBID id : relation.iterDBIDs()) { double sum = 0; @@ -136,18 +136,18 @@ public class SLOM<N, O, D extends NumberDistance<D, ?>> extends AbstractDistance if(neighbor.equals(id)) { continue; } - sum += modifiedDistance.get(neighbor); + sum += modifiedDistance.doubleValue(neighbor); cnt++; } double slom; if(cnt > 0) { // With and without the object itself: - double avgPlus = (sum + modifiedDistance.get(id)) / (cnt + 1); + double avgPlus = (sum + modifiedDistance.doubleValue(id)) / (cnt + 1); double avg = sum / cnt; double beta = 0; for(DBID neighbor : neighbors) { - final double dist = modifiedDistance.get(neighbor).doubleValue(); + final double dist = modifiedDistance.doubleValue(neighbor); if(dist > avgPlus) { beta += 1; } @@ -157,7 +157,7 @@ public class SLOM<N, O, D extends NumberDistance<D, ?>> extends AbstractDistance } // Include object itself if(!neighbors.contains(id)) { - final double dist = modifiedDistance.get(id).doubleValue(); + final double dist = modifiedDistance.doubleValue(id); if(dist > avgPlus) { beta += 1; } @@ -176,13 +176,13 @@ public class SLOM<N, O, D extends NumberDistance<D, ?>> extends AbstractDistance } beta = beta / (1 + avg); - slom = beta * modifiedDistance.get(id); + slom = beta * modifiedDistance.doubleValue(id); } else { // No neighbors to compare to - no score. 
slom = 0.0; } - sloms.put(id, slom); + sloms.putDouble(id, slom); slomminmax.put(slom); } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/SOF.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/SOF.java index e3ede8bb..abc3c481 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/SOF.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/SOF.java @@ -3,7 +3,7 @@ package de.lmu.ifi.dbs.elki.algorithm.outlier.spatial; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures -Copyright (C) 2011 +Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -28,16 +28,16 @@ import de.lmu.ifi.dbs.elki.data.type.TypeUtil; import de.lmu.ifi.dbs.elki.database.Database;
import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory;
import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil;
-import de.lmu.ifi.dbs.elki.database.datastore.WritableDataStore;
+import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore;
import de.lmu.ifi.dbs.elki.database.ids.DBID;
import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery;
+import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.math.DoubleMinMax;
-import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation;
import de.lmu.ifi.dbs.elki.result.outlier.OutlierResult;
import de.lmu.ifi.dbs.elki.result.outlier.OutlierScoreMeta;
import de.lmu.ifi.dbs.elki.result.outlier.QuotientOutlierScoreMeta;
@@ -103,8 +103,8 @@ public class SOF<N, O, D extends NumberDistance<D, ?>> extends AbstractDistanceB final NeighborSetPredicate npred = getNeighborSetPredicateFactory().instantiate(spatial);
DistanceQuery<O, D> distFunc = getNonSpatialDistanceFunction().instantiate(relation);
- WritableDataStore<Double> lrds = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, Double.class);
- WritableDataStore<Double> lofs = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC, Double.class);
+ WritableDoubleDataStore lrds = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT);
+ WritableDoubleDataStore lofs = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC);
DoubleMinMax lofminmax = new DoubleMinMax();
// Compute densities
@@ -118,7 +118,7 @@ public class SOF<N, O, D extends NumberDistance<D, ?>> extends AbstractDistanceB if (Double.isNaN(lrd)) {
lrd = 0;
}
- lrds.put(id, lrd);
+ lrds.putDouble(id, lrd);
}
// Compute density quotients
@@ -126,14 +126,14 @@ public class SOF<N, O, D extends NumberDistance<D, ?>> extends AbstractDistanceB DBIDs neighbors = npred.getNeighborDBIDs(id);
double avg = 0;
for(DBID n : neighbors) {
- avg += lrds.get(n);
+ avg += lrds.doubleValue(n);
}
- final double lrd = (avg / neighbors.size()) / lrds.get(id);
+ final double lrd = (avg / neighbors.size()) / lrds.doubleValue(id);
if (!Double.isNaN(lrd)) {
- lofs.put(id, lrd);
+ lofs.putDouble(id, lrd);
lofminmax.put(lrd);
} else {
- lofs.put(id, 0.0);
+ lofs.putDouble(id, 0.0);
}
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/TrimmedMeanApproach.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/TrimmedMeanApproach.java index d99ecc99..75700bca 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/TrimmedMeanApproach.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/TrimmedMeanApproach.java @@ -3,7 +3,7 @@ package de.lmu.ifi.dbs.elki.algorithm.outlier.spatial; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures -Copyright (C) 2011 +Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -32,7 +32,7 @@ import de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation; import de.lmu.ifi.dbs.elki.database.Database;
import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory;
import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil;
-import de.lmu.ifi.dbs.elki.database.datastore.WritableDataStore;
+import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore;
import de.lmu.ifi.dbs.elki.database.ids.DBID;
import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation;
@@ -41,11 +41,11 @@ import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress;
import de.lmu.ifi.dbs.elki.math.DoubleMinMax;
import de.lmu.ifi.dbs.elki.math.Mean;
-import de.lmu.ifi.dbs.elki.math.statistics.QuickSelect;
import de.lmu.ifi.dbs.elki.result.outlier.BasicOutlierScoreMeta;
import de.lmu.ifi.dbs.elki.result.outlier.OutlierResult;
import de.lmu.ifi.dbs.elki.result.outlier.OutlierScoreMeta;
import de.lmu.ifi.dbs.elki.utilities.DatabaseUtil;
+import de.lmu.ifi.dbs.elki.utilities.datastructures.QuickSelect;
import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
@@ -112,8 +112,8 @@ public class TrimmedMeanApproach<N> extends AbstractNeighborhoodOutlier<N> { assert (DatabaseUtil.dimensionality(relation) == 1) : "TrimmedMean can only process one-dimensional data sets.";
final NeighborSetPredicate npred = getNeighborSetPredicateFactory().instantiate(nrel);
- WritableDataStore<Double> errors = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP, Double.class);
- WritableDataStore<Double> scores = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC, Double.class);
+ WritableDoubleDataStore errors = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP);
+ WritableDoubleDataStore scores = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC);
FiniteProgress progress = logger.isVerbose() ? new FiniteProgress("Computing trimmed means", relation.size(), logger) : null;
for(DBID id : relation.iterDBIDs()) {
@@ -142,7 +142,7 @@ public class TrimmedMeanApproach<N> extends AbstractNeighborhoodOutlier<N> { tm = relation.get(id).doubleValue(1);
}
// Error: deviation from trimmed mean
- errors.put(id, relation.get(id).doubleValue(1) - tm);
+ errors.putDouble(id, relation.get(id).doubleValue(1) - tm);
if(progress != null) {
progress.incrementProcessed(logger);
@@ -162,7 +162,7 @@ public class TrimmedMeanApproach<N> extends AbstractNeighborhoodOutlier<N> { {
int i = 0;
for(DBID id : relation.iterDBIDs()) {
- ei[i] = errors.get(id);
+ ei[i] = errors.doubleValue(id);
i++;
}
}
@@ -181,8 +181,8 @@ public class TrimmedMeanApproach<N> extends AbstractNeighborhoodOutlier<N> { // calculate score
DoubleMinMax minmax = new DoubleMinMax();
for(DBID id : relation.iterDBIDs()) {
- double score = Math.abs(errors.get(id)) * 0.6745 / median_dev_from_median;
- scores.put(id, score);
+ double score = Math.abs(errors.doubleValue(id)) * 0.6745 / median_dev_from_median;
+ scores.putDouble(id, score);
minmax.put(score);
}
//
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/neighborhood/AbstractPrecomputedNeighborhood.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/neighborhood/AbstractPrecomputedNeighborhood.java index df28de24..5898b053 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/neighborhood/AbstractPrecomputedNeighborhood.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/neighborhood/AbstractPrecomputedNeighborhood.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.outlier.spatial.neighborhood; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/neighborhood/ExtendedNeighborhood.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/neighborhood/ExtendedNeighborhood.java index 640c30fe..9ee92d35 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/neighborhood/ExtendedNeighborhood.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/neighborhood/ExtendedNeighborhood.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.outlier.spatial.neighborhood; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -31,6 +31,7 @@ import de.lmu.ifi.dbs.elki.database.datastore.WritableDataStore; import de.lmu.ifi.dbs.elki.database.ids.DBID; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; +import de.lmu.ifi.dbs.elki.database.ids.HashSetModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.logging.Logging; @@ 
-132,15 +133,15 @@ public class ExtendedNeighborhood extends AbstractPrecomputedNeighborhood { // Expand multiple steps FiniteProgress progress = logger.isVerbose() ? new FiniteProgress("Expanding neighborhoods", database.size(), logger) : null; for(final DBID id : database.iterDBIDs()) { - ModifiableDBIDs res = DBIDUtil.newHashSet(id); + HashSetModifiableDBIDs res = DBIDUtil.newHashSet(id); DBIDs todo = id; for(int i = 0; i < steps; i++) { ModifiableDBIDs ntodo = DBIDUtil.newHashSet(); for(final DBID oid : todo) { DBIDs add = innerinst.getNeighborDBIDs(oid); if(add != null) { - for (DBID nid: add) { - if (res.contains(add)) { + for(DBID nid : add) { + if(res.contains(nid)) { continue; } ntodo.add(nid); @@ -148,7 +149,7 @@ public class ExtendedNeighborhood extends AbstractPrecomputedNeighborhood { } } } - if (ntodo.size() == 0) { + if(ntodo.size() == 0) { continue; } todo = ntodo; diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/neighborhood/ExternalNeighborhood.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/neighborhood/ExternalNeighborhood.java index 789a63e7..f2586e2e 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/neighborhood/ExternalNeighborhood.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/neighborhood/ExternalNeighborhood.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.outlier.spatial.neighborhood; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/neighborhood/NeighborSetPredicate.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/neighborhood/NeighborSetPredicate.java index 6683b3fd..3a6d0e28 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/neighborhood/NeighborSetPredicate.java +++ 
b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/neighborhood/NeighborSetPredicate.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.outlier.spatial.neighborhood; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/neighborhood/PrecomputedKNearestNeighborNeighborhood.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/neighborhood/PrecomputedKNearestNeighborNeighborhood.java index fa878778..f5ea7e15 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/neighborhood/PrecomputedKNearestNeighborNeighborhood.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/neighborhood/PrecomputedKNearestNeighborNeighborhood.java @@ -3,7 +3,7 @@ package de.lmu.ifi.dbs.elki.algorithm.outlier.spatial.neighborhood; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures -Copyright (C) 2011 +Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -22,8 +22,6 @@ You should have received a copy of the GNU Affero General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. */ -import java.util.List;
-
import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
import de.lmu.ifi.dbs.elki.database.QueryUtil;
import de.lmu.ifi.dbs.elki.database.datastore.DataStore;
@@ -36,6 +34,7 @@ import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
import de.lmu.ifi.dbs.elki.database.query.DistanceResultPair;
import de.lmu.ifi.dbs.elki.database.query.knn.KNNQuery;
+import de.lmu.ifi.dbs.elki.database.query.knn.KNNResult;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction;
import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance;
@@ -121,7 +120,7 @@ public class PrecomputedKNearestNeighborNeighborhood<D extends Distance<D>> exte // TODO: use bulk?
WritableDataStore<DBIDs> s = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_STATIC, DBIDs.class);
for(DBID id : relation.iterDBIDs()) {
- List<DistanceResultPair<D>> neighbors = knnQuery.getKNNForDBID(id, k);
+ KNNResult<D> neighbors = knnQuery.getKNNForDBID(id, k);
ArrayModifiableDBIDs neighbours = DBIDUtil.newArray(neighbors.size());
for(DistanceResultPair<D> dpair : neighbors) {
neighbours.add(dpair.getDBID());
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/neighborhood/package-info.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/neighborhood/package-info.java index eb490642..47ca5ad2 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/neighborhood/package-info.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/neighborhood/package-info.java @@ -5,7 +5,7 @@ This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures -Copyright (C) 2011 +Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/neighborhood/weighted/LinearWeightedExtendedNeighborhood.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/neighborhood/weighted/LinearWeightedExtendedNeighborhood.java index b2f70e16..52fc2c46 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/neighborhood/weighted/LinearWeightedExtendedNeighborhood.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/neighborhood/weighted/LinearWeightedExtendedNeighborhood.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.outlier.spatial.neighborhood.weighted; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/neighborhood/weighted/UnweightedNeighborhoodAdapter.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/neighborhood/weighted/UnweightedNeighborhoodAdapter.java index 0927c026..4378aa2e 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/neighborhood/weighted/UnweightedNeighborhoodAdapter.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/neighborhood/weighted/UnweightedNeighborhoodAdapter.java @@ -4,7 
+4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.outlier.spatial.neighborhood.weighted; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/neighborhood/weighted/WeightedNeighborSetPredicate.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/neighborhood/weighted/WeightedNeighborSetPredicate.java index d0cdfa9c..b147935a 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/neighborhood/weighted/WeightedNeighborSetPredicate.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/neighborhood/weighted/WeightedNeighborSetPredicate.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.outlier.spatial.neighborhood.weighted; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/neighborhood/weighted/package-info.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/neighborhood/weighted/package-info.java index ff82dbee..39165cfd 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/neighborhood/weighted/package-info.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/neighborhood/weighted/package-info.java @@ -5,7 +5,7 @@ This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures -Copyright (C) 2011 +Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/package-info.java 
b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/package-info.java index df50e592..13bf3f25 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/package-info.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/package-info.java @@ -5,7 +5,7 @@ This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures -Copyright (C) 2011 +Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/trivial/ByLabelOutlier.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/trivial/ByLabelOutlier.java index 62f083fb..86730404 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/trivial/ByLabelOutlier.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/trivial/ByLabelOutlier.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.outlier.trivial; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -34,7 +34,7 @@ import de.lmu.ifi.dbs.elki.data.type.TypeUtil; import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; -import de.lmu.ifi.dbs.elki.database.datastore.WritableDataStore; +import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore; import de.lmu.ifi.dbs.elki.database.ids.DBID; import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation; import de.lmu.ifi.dbs.elki.database.relation.Relation; @@ -111,7 +111,7 @@ public class ByLabelOutlier extends AbstractAlgorithm<OutlierResult> implements * @return Result */ public OutlierResult run(Relation<?> relation) { - WritableDataStore<Double> scores = DataStoreUtil.makeStorage(relation.getDBIDs(), 
DataStoreFactory.HINT_HOT, Double.class); + WritableDoubleDataStore scores = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_HOT); for(DBID id : relation.iterDBIDs()) { String label = relation.get(id).toString(); final double score; @@ -120,7 +120,7 @@ public class ByLabelOutlier extends AbstractAlgorithm<OutlierResult> implements } else { score = 0.0; } - scores.put(id, score); + scores.putDouble(id, score); } Relation<Double> scoreres = new MaterializedRelation<Double>("By label outlier scores", "label-outlier", TypeUtil.DOUBLE, scores, relation.getDBIDs()); OutlierScoreMeta meta = new ProbabilisticOutlierScore(); diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/trivial/TrivialAllOutlier.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/trivial/TrivialAllOutlier.java index ff93e0ed..509e35e9 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/trivial/TrivialAllOutlier.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/trivial/TrivialAllOutlier.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.outlier.trivial; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -29,7 +29,7 @@ import de.lmu.ifi.dbs.elki.data.type.TypeInformation; import de.lmu.ifi.dbs.elki.data.type.TypeUtil; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; -import de.lmu.ifi.dbs.elki.database.datastore.WritableDataStore; +import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore; import de.lmu.ifi.dbs.elki.database.ids.DBID; import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation; import de.lmu.ifi.dbs.elki.database.relation.Relation; @@ -69,9 +69,9 @@ public class TrivialAllOutlier extends AbstractAlgorithm<OutlierResult> implemen * @return Result */ 
public OutlierResult run(Relation<?> relation) { - WritableDataStore<Double> scores = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_HOT, Double.class); + WritableDoubleDataStore scores = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_HOT); for(DBID id : relation.iterDBIDs()) { - scores.put(id, 1.0); + scores.putDouble(id, 1.0); } Relation<Double> scoreres = new MaterializedRelation<Double>("Trivial all-outlier score", "all-outlier", TypeUtil.DOUBLE, scores, relation.getDBIDs()); OutlierScoreMeta meta = new ProbabilisticOutlierScore(); diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/trivial/TrivialGeneratedOutlier.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/trivial/TrivialGeneratedOutlier.java new file mode 100644 index 00000000..db40ff30 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/trivial/TrivialGeneratedOutlier.java @@ -0,0 +1,212 @@ +package de.lmu.ifi.dbs.elki.algorithm.outlier.trivial; +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2012 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +import java.util.HashSet; + +import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm; +import de.lmu.ifi.dbs.elki.algorithm.outlier.OutlierAlgorithm; +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.data.model.Model; +import de.lmu.ifi.dbs.elki.data.synthetic.bymodel.GeneratorSingleCluster; +import de.lmu.ifi.dbs.elki.data.type.NoSupportedDataTypeException; +import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation; +import de.lmu.ifi.dbs.elki.data.type.TypeInformation; +import de.lmu.ifi.dbs.elki.data.type.TypeUtil; +import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; +import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; +import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore; +import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; +import de.lmu.ifi.dbs.elki.math.statistics.distribution.ChiSquaredDistribution; +import de.lmu.ifi.dbs.elki.math.statistics.distribution.Distribution; +import de.lmu.ifi.dbs.elki.math.statistics.distribution.NormalDistribution; +import de.lmu.ifi.dbs.elki.result.outlier.OutlierResult; +import de.lmu.ifi.dbs.elki.result.outlier.OutlierScoreMeta; +import de.lmu.ifi.dbs.elki.result.outlier.ProbabilisticOutlierScore; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter; + +/** + * Extract outlier score from the model the objects were generated by. 
+ * + * This algorithm can only be applied to data that was freshly generated, to the + * generator model information is still available. + * + * @author Erich Schubert + */ +public class TrivialGeneratedOutlier extends AbstractAlgorithm<OutlierResult> implements OutlierAlgorithm { + /** + * Class logger + */ + private static final Logging logger = Logging.getLogger(TrivialGeneratedOutlier.class); + + /** + * Expected share of outliers + */ + public static final OptionID EXPECT_ID = OptionID.getOrCreateOptionID("modeloutlier.expect", "Expected amount of outliers, for making the scores more intuitive."); + + /** + * Expected share of outliers. + */ + double expect = 0.01; + + /** + * Constructor. + * + * @param expect Expected share of outliers + */ + public TrivialGeneratedOutlier(double expect) { + super(); + this.expect = expect; + } + + /** + * Constructor. + */ + public TrivialGeneratedOutlier() { + this(0.01); + } + + @Override + public TypeInformation[] getInputTypeRestriction() { + return TypeUtil.array(TypeUtil.NUMBER_VECTOR_FIELD, new SimpleTypeInformation<Model>(Model.class), TypeUtil.GUESSED_LABEL); + } + + @Override + public OutlierResult run(Database database) throws IllegalStateException { + Relation<NumberVector<?, ?>> vecs = database.getRelation(TypeUtil.NUMBER_VECTOR_FIELD); + Relation<Model> models = database.getRelation(new SimpleTypeInformation<Model>(Model.class)); + // Prefer a true class label + try { + Relation<?> relation = database.getRelation(TypeUtil.CLASSLABEL); + return run(models, vecs, relation); + } + catch(NoSupportedDataTypeException e) { + // Otherwise, try any labellike. 
+ return run(models, vecs, database.getRelation(TypeUtil.GUESSED_LABEL)); + } + } + + /** + * Run the algorithm + * + * @param models Model relation + * @param vecs Vector relation + * @param labels Label relation + * @return Outlier result + */ + public OutlierResult run(Relation<Model> models, Relation<NumberVector<?, ?>> vecs, Relation<?> labels) { + WritableDoubleDataStore scores = DataStoreUtil.makeDoubleStorage(models.getDBIDs(), DataStoreFactory.HINT_HOT); + + // Adjustment constant + final double minscore = expect / (expect + 1); + + HashSet<GeneratorSingleCluster> generators = new HashSet<GeneratorSingleCluster>(); + for(DBID id : models.iterDBIDs()) { + Model model = models.get(id); + if(model instanceof GeneratorSingleCluster) { + generators.add((GeneratorSingleCluster) model); + } + } + if(generators.size() == 0) { + logger.warning("No generator models found for dataset - all points will be considered outliers."); + } + + for(DBID id : models.iterDBIDs()) { + double score = 0.0; + // Convert to a math vector + Vector v = vecs.get(id).getColumnVector(); + for(GeneratorSingleCluster gen : generators) { + Vector tv = v; + // Transform backwards + if(gen.getTransformation() != null) { + tv = gen.getTransformation().applyInverse(v); + } + final int dim = tv.getDimensionality(); + double lensq = 0.0; + int norm = 0; + for(int i = 0; i < dim; i++) { + Distribution dist = gen.getDistribution(i); + if(dist instanceof NormalDistribution) { + NormalDistribution d = (NormalDistribution) dist; + double delta = (tv.get(i) - d.getMean()) / d.getStddev(); + lensq += delta * delta; + norm += 1; + } + } + if(norm > 0) { + // The squared distances are ChiSquared distributed + score = Math.max(score, 1 - ChiSquaredDistribution.cdf(lensq, norm)); + } + } + // score inversion. 
+ score = expect / (expect + score); + // adjust to 0 to 1 range: + score = (score - minscore) / (1 - minscore); + scores.putDouble(id, score); + } + Relation<Double> scoreres = new MaterializedRelation<Double>("Model outlier scores", "model-outlier", TypeUtil.DOUBLE, scores, models.getDBIDs()); + OutlierScoreMeta meta = new ProbabilisticOutlierScore(0., 1.); + return new OutlierResult(meta, scoreres); + } + + @Override + protected Logging getLogger() { + return logger; + } + + /** + * Parameterization class. + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer extends AbstractParameterizer { + /** + * Expected share of outliers + */ + double expect; + + @Override + protected void makeOptions(Parameterization config) { + super.makeOptions(config); + DoubleParameter expectP = new DoubleParameter(EXPECT_ID, 0.01); + if(config.grab(expectP)) { + expect = expectP.getValue(); + } + } + + @Override + protected TrivialGeneratedOutlier makeInstance() { + return new TrivialGeneratedOutlier(expect); + } + } +}
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/trivial/TrivialNoOutlier.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/trivial/TrivialNoOutlier.java index f3ae7e72..cff2ad2c 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/trivial/TrivialNoOutlier.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/trivial/TrivialNoOutlier.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.outlier.trivial; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -29,7 +29,7 @@ import de.lmu.ifi.dbs.elki.data.type.TypeInformation; import de.lmu.ifi.dbs.elki.data.type.TypeUtil; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; -import de.lmu.ifi.dbs.elki.database.datastore.WritableDataStore; +import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore; import de.lmu.ifi.dbs.elki.database.ids.DBID; import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation; import de.lmu.ifi.dbs.elki.database.relation.Relation; @@ -69,9 +69,9 @@ public class TrivialNoOutlier extends AbstractAlgorithm<OutlierResult> implement * @return Result */ public OutlierResult run(Relation<?> relation) throws IllegalStateException { - WritableDataStore<Double> scores = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_HOT, Double.class); + WritableDoubleDataStore scores = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_HOT); for(DBID id : relation.iterDBIDs()) { - scores.put(id, 0.0); + scores.putDouble(id, 0.0); } Relation<Double> scoreres = new MaterializedRelation<Double>("Trivial no-outlier score", "no-outlier", TypeUtil.DOUBLE, scores, relation.getDBIDs()); OutlierScoreMeta meta = new ProbabilisticOutlierScore(); diff --git 
a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/trivial/package-info.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/trivial/package-info.java index fbae60dc..d49d3565 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/trivial/package-info.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/trivial/package-info.java @@ -7,7 +7,7 @@ This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures -Copyright (C) 2011 +Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/package-info.java b/src/de/lmu/ifi/dbs/elki/algorithm/package-info.java index 0dfbc8c1..c18579f0 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/package-info.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/package-info.java @@ -10,7 +10,7 @@ This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures -Copyright (C) 2011 +Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/statistics/AveragePrecisionAtK.java b/src/de/lmu/ifi/dbs/elki/algorithm/statistics/AveragePrecisionAtK.java new file mode 100644 index 00000000..1c74621b --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/statistics/AveragePrecisionAtK.java @@ -0,0 +1,235 @@ +package de.lmu.ifi.dbs.elki.algorithm.statistics; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2012 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +import java.util.ArrayList; +import java.util.Collection; +import java.util.Iterator; + +import de.lmu.ifi.dbs.elki.algorithm.AbstractDistanceBasedAlgorithm; +import de.lmu.ifi.dbs.elki.data.DoubleVector; +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.data.type.TypeInformation; +import de.lmu.ifi.dbs.elki.data.type.TypeUtil; +import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.ids.DBIDs; +import de.lmu.ifi.dbs.elki.database.query.DistanceResultPair; +import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; +import de.lmu.ifi.dbs.elki.database.query.knn.KNNQuery; +import de.lmu.ifi.dbs.elki.database.query.knn.KNNResult; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance; +import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress; +import de.lmu.ifi.dbs.elki.math.MeanVariance; +import de.lmu.ifi.dbs.elki.result.CollectionResult; +import de.lmu.ifi.dbs.elki.result.HistogramResult; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.IntervalConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.IntervalConstraint.IntervalBoundary; +import 
de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.LongParameter; + +/** + * Evaluate a distance functions performance by computing the average precision + * at k, when ranking the objects by distance. + * + * @author Erich Schubert + * @param <V> Vector type + * @param <D> Distance type + */ +public class AveragePrecisionAtK<V extends Object, D extends NumberDistance<D, ?>> extends AbstractDistanceBasedAlgorithm<V, D, CollectionResult<DoubleVector>> { + /** + * The logger for this class. + */ + private static final Logging logger = Logging.getLogger(AveragePrecisionAtK.class); + + /** + * The parameter k - the number of neighbors to retrieve. + */ + private int k; + + /** + * Relative number of object to use in sampling. + */ + private double sampling = 1.0; + + /** + * Random sampling seed. + */ + private Long seed = null; + + /** + * Constructor. + * + * @param distanceFunction Distance function + * @param sampling Sampling rate + * @param seed Random sampling seed (may be null) + */ + public AveragePrecisionAtK(DistanceFunction<? 
super V, D> distanceFunction, int k, double sampling, Long seed) { + super(distanceFunction); + this.k = k; + this.sampling = sampling; + this.seed = seed; + } + + @Override + public HistogramResult<DoubleVector> run(Database database) throws IllegalStateException { + final Relation<V> relation = database.getRelation(getInputTypeRestriction()[0]); + final Relation<Object> lrelation = database.getRelation(getInputTypeRestriction()[1]); + final DistanceQuery<V, D> distQuery = database.getDistanceQuery(relation, getDistanceFunction()); + final KNNQuery<V, D> knnQuery = database.getKNNQuery(distQuery, k); + + MeanVariance[] mvs = MeanVariance.newArray(k); + + final DBIDs ids; + if (sampling < 1.0) { + int size = Math.max(1, (int) (sampling * relation.size())); + ids = DBIDUtil.randomSample(relation.getDBIDs(), size, seed); + } else { + ids = relation.getDBIDs(); + } + + if(logger.isVerbose()) { + logger.verbose("Processing points..."); + } + FiniteProgress objloop = logger.isVerbose() ? new FiniteProgress("Computing nearest neighbors", ids.size(), logger) : null; + // sort neighbors + for(DBID id : ids) { + KNNResult<D> knn = knnQuery.getKNNForDBID(id, k); + Object label = lrelation.get(id); + + int positive = 0; + Iterator<DistanceResultPair<D>> ri = knn.iterator(); + for(int i = 0; i < k && ri.hasNext(); i++) { + DBID nid = ri.next().getDBID(); + Object olabel = lrelation.get(nid); + if(label == null) { + if(olabel == null) { + positive += 1; + } + } + else { + if(label.equals(olabel)) { + positive += 1; + } + } + final double precision = positive / (double) (i + 1); + mvs[i].put(precision); + } + if(objloop != null) { + objloop.incrementProcessed(logger); + } + } + if(objloop != null) { + objloop.ensureCompleted(logger); + } + // Collections.sort(results); + + // Transform Histogram into a Double Vector array. 
+ Collection<DoubleVector> res = new ArrayList<DoubleVector>(k); + for(int i = 0; i < k; i++) { + DoubleVector row = new DoubleVector(new double[] { mvs[i].getMean(), mvs[i].getSampleStddev() }); + res.add(row); + } + return new HistogramResult<DoubleVector>("Average Precision", "average-precision", res); + } + + @Override + public TypeInformation[] getInputTypeRestriction() { + return TypeUtil.array(getDistanceFunction().getInputTypeRestriction(), TypeUtil.GUESSED_LABEL); + } + + @Override + protected Logging getLogger() { + return logger; + } + + /** + * Parameterization class. + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer<V extends NumberVector<V, ?>, D extends NumberDistance<D, ?>> extends AbstractDistanceBasedAlgorithm.Parameterizer<V, D> { + /** + * Parameter k to compute the average precision at. + */ + private static final OptionID K_ID = OptionID.getOrCreateOptionID("avep.k", "K to compute the average precision at."); + + /** + * Parameter to enable sampling + */ + public static final OptionID SAMPLING_ID = OptionID.getOrCreateOptionID("avep.sampling", "Relative amount of object to sample."); + + /** + * Parameter to control the sampling random seed + */ + public static final OptionID SEED_ID = OptionID.getOrCreateOptionID("avep.sampling-seed", "Random seed for deterministic sampling."); + + /** + * Neighborhood size + */ + protected int k = 20; + + /** + * Relative amount of data to sample + */ + protected double sampling = 1.0; + + /** + * Random sampling seed. 
+ */ + protected Long seed = null; + + @Override + protected void makeOptions(Parameterization config) { + super.makeOptions(config); + final IntParameter kP = new IntParameter(K_ID, new GreaterEqualConstraint(2)); + if(config.grab(kP)) { + k = kP.getValue(); + } + final DoubleParameter samplingP = new DoubleParameter(SAMPLING_ID, new IntervalConstraint(0.0, IntervalBoundary.OPEN, 1.0, IntervalBoundary.CLOSE), true); + if (config.grab(samplingP)) { + sampling = samplingP.getValue(); + } + final LongParameter seedP = new LongParameter(SEED_ID, true); + if (config.grab(seedP)) { + seed = seedP.getValue(); + } + } + + @Override + protected AveragePrecisionAtK<V, D> makeInstance() { + return new AveragePrecisionAtK<V, D>(distanceFunction, k, sampling, seed); + } + } +}
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/statistics/DistanceStatisticsWithClasses.java b/src/de/lmu/ifi/dbs/elki/algorithm/statistics/DistanceStatisticsWithClasses.java index 33201b67..78bbf5f4 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/statistics/DistanceStatisticsWithClasses.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/statistics/DistanceStatisticsWithClasses.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.statistics; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -49,10 +49,10 @@ import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance; import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress; import de.lmu.ifi.dbs.elki.logging.progress.StepProgress; -import de.lmu.ifi.dbs.elki.math.AggregatingHistogram; import de.lmu.ifi.dbs.elki.math.DoubleMinMax; -import de.lmu.ifi.dbs.elki.math.FlexiHistogram; import de.lmu.ifi.dbs.elki.math.MeanVariance; +import de.lmu.ifi.dbs.elki.math.histograms.AggregatingHistogram; +import de.lmu.ifi.dbs.elki.math.histograms.FlexiHistogram; import de.lmu.ifi.dbs.elki.result.CollectionResult; import de.lmu.ifi.dbs.elki.result.HistogramResult; import de.lmu.ifi.dbs.elki.utilities.documentation.Description; @@ -65,6 +65,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameteriz import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.Flag; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.Parameter; +import de.lmu.ifi.dbs.elki.utilities.pairs.DoubleObjPair; import de.lmu.ifi.dbs.elki.utilities.pairs.FCPair; import de.lmu.ifi.dbs.elki.utilities.pairs.Pair; @@ -249,7 +250,7 @@ public class 
DistanceStatisticsWithClasses<O, D extends NumberDistance<D, ?>> ex // count the number of samples we have in the data long inum = 0; long onum = 0; - for(Pair<Double, Pair<Long, Long>> ppair : histogram) { + for(DoubleObjPair<Pair<Long, Long>> ppair : histogram) { inum += ppair.getSecond().getFirst(); onum += ppair.getSecond().getSecond(); } @@ -258,12 +259,12 @@ public class DistanceStatisticsWithClasses<O, D extends NumberDistance<D, ?>> ex assert (bnum == relation.size() * (relation.size() - 1)); Collection<DoubleVector> binstat = new ArrayList<DoubleVector>(numbin); - for(Pair<Double, Pair<Long, Long>> ppair : histogram) { + for(DoubleObjPair<Pair<Long, Long>> ppair : histogram) { final double icof = (inum == 0) ? 0 : ((double) ppair.getSecond().getFirst()) / inum / histogram.getBinsize(); final double icaf = ((double) ppair.getSecond().getFirst()) / bnum / histogram.getBinsize(); final double ocof = (onum == 0) ? 0 : ((double) ppair.getSecond().getSecond()) / onum / histogram.getBinsize(); final double ocaf = ((double) ppair.getSecond().getSecond()) / bnum / histogram.getBinsize(); - DoubleVector row = new DoubleVector(new double[] { ppair.getFirst(), icof, icaf, ocof, ocaf }); + DoubleVector row = new DoubleVector(new double[] { ppair.first, icof, icaf, ocof, ocaf }); binstat.add(row); } HistogramResult<DoubleVector> result = new HistogramResult<DoubleVector>("Distance Histogram", "distance-histogram", binstat); diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/statistics/EvaluateRankingQuality.java b/src/de/lmu/ifi/dbs/elki/algorithm/statistics/EvaluateRankingQuality.java index 58fb5b89..c1eb118d 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/statistics/EvaluateRankingQuality.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/statistics/EvaluateRankingQuality.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.statistics; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 
2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -27,7 +27,6 @@ import java.util.ArrayList; import java.util.Collection; import java.util.Collections; import java.util.HashMap; -import java.util.List; import de.lmu.ifi.dbs.elki.algorithm.AbstractDistanceBasedAlgorithm; import de.lmu.ifi.dbs.elki.algorithm.clustering.trivial.ByLabelClustering; @@ -40,18 +39,18 @@ import de.lmu.ifi.dbs.elki.data.type.TypeInformation; import de.lmu.ifi.dbs.elki.data.type.TypeUtil; import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.ids.DBID; -import de.lmu.ifi.dbs.elki.database.query.DistanceResultPair; import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; import de.lmu.ifi.dbs.elki.database.query.knn.KNNQuery; +import de.lmu.ifi.dbs.elki.database.query.knn.KNNResult; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction; import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance; import de.lmu.ifi.dbs.elki.evaluation.roc.ROC; import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress; -import de.lmu.ifi.dbs.elki.math.AggregatingHistogram; import de.lmu.ifi.dbs.elki.math.MathUtil; import de.lmu.ifi.dbs.elki.math.MeanVariance; +import de.lmu.ifi.dbs.elki.math.histograms.AggregatingHistogram; import de.lmu.ifi.dbs.elki.math.linearalgebra.Matrix; import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; import de.lmu.ifi.dbs.elki.result.CollectionResult; @@ -63,8 +62,8 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; +import de.lmu.ifi.dbs.elki.utilities.pairs.DoubleObjPair; import 
de.lmu.ifi.dbs.elki.utilities.pairs.FCPair; -import de.lmu.ifi.dbs.elki.utilities.pairs.Pair; /** * Evaluate a distance function with respect to kNN queries. For each point, the @@ -159,7 +158,7 @@ public class EvaluateRankingQuality<V extends NumberVector<V, ?>, D extends Numb for(int ind = 0; ind < cmem.size(); ind++) { DBID i1 = cmem.get(ind).getSecond(); - List<DistanceResultPair<D>> knn = knnQuery.getKNNForDBID(i1, relation.size()); + KNNResult<D> knn = knnQuery.getKNNForDBID(i1, relation.size()); double result = ROC.computeROCAUCDistanceResult(relation.size(), clus, knn); hist.aggregate(((double) ind) / clus.size(), result); @@ -176,8 +175,8 @@ public class EvaluateRankingQuality<V extends NumberVector<V, ?>, D extends Numb // Transform Histogram into a Double Vector array. Collection<DoubleVector> res = new ArrayList<DoubleVector>(relation.size()); - for(Pair<Double, MeanVariance> pair : hist) { - DoubleVector row = new DoubleVector(new double[] { pair.getFirst(), pair.getSecond().getCount(), pair.getSecond().getMean(), pair.getSecond().getSampleVariance() }); + for(DoubleObjPair<MeanVariance> pair : hist) { + DoubleVector row = new DoubleVector(new double[] { pair.first, pair.getSecond().getCount(), pair.getSecond().getMean(), pair.getSecond().getSampleVariance() }); res.add(row); } return new HistogramResult<DoubleVector>("Ranking Quality Histogram", "ranking-histogram", res); diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/statistics/RankingQualityHistogram.java b/src/de/lmu/ifi/dbs/elki/algorithm/statistics/RankingQualityHistogram.java index 07e6795a..6d64dc55 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/statistics/RankingQualityHistogram.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/statistics/RankingQualityHistogram.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.statistics; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 
Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -25,7 +25,6 @@ package de.lmu.ifi.dbs.elki.algorithm.statistics; import java.util.ArrayList; import java.util.Collection; -import java.util.List; import de.lmu.ifi.dbs.elki.algorithm.AbstractDistanceBasedAlgorithm; import de.lmu.ifi.dbs.elki.algorithm.clustering.trivial.ByLabelClustering; @@ -36,17 +35,17 @@ import de.lmu.ifi.dbs.elki.data.type.TypeInformation; import de.lmu.ifi.dbs.elki.data.type.TypeUtil; import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.ids.DBID; -import de.lmu.ifi.dbs.elki.database.query.DistanceResultPair; import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; import de.lmu.ifi.dbs.elki.database.query.knn.KNNQuery; +import de.lmu.ifi.dbs.elki.database.query.knn.KNNResult; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction; import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance; import de.lmu.ifi.dbs.elki.evaluation.roc.ROC; import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress; -import de.lmu.ifi.dbs.elki.math.AggregatingHistogram; import de.lmu.ifi.dbs.elki.math.MeanVariance; +import de.lmu.ifi.dbs.elki.math.histograms.AggregatingHistogram; import de.lmu.ifi.dbs.elki.result.CollectionResult; import de.lmu.ifi.dbs.elki.result.HistogramResult; import de.lmu.ifi.dbs.elki.utilities.documentation.Description; @@ -55,7 +54,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; -import de.lmu.ifi.dbs.elki.utilities.pairs.Pair; +import de.lmu.ifi.dbs.elki.utilities.pairs.DoubleObjPair; /** * Evaluate a 
distance function with respect to kNN queries. For each point, the @@ -121,7 +120,7 @@ public class RankingQualityHistogram<O, D extends NumberDistance<D, ?>> extends // sort neighbors for(Cluster<?> clus : split) { for(DBID i1 : clus.getIDs()) { - List<DistanceResultPair<D>> knn = knnQuery.getKNNForDBID(i1, relation.size()); + KNNResult<D> knn = knnQuery.getKNNForDBID(i1, relation.size()); double result = ROC.computeROCAUCDistanceResult(relation.size(), clus, knn); mv.put(result); @@ -138,8 +137,8 @@ public class RankingQualityHistogram<O, D extends NumberDistance<D, ?>> extends // Transform Histogram into a Double Vector array. Collection<DoubleVector> res = new ArrayList<DoubleVector>(relation.size()); - for(Pair<Double, Double> pair : hist) { - DoubleVector row = new DoubleVector(new double[] { pair.getFirst(), pair.getSecond() }); + for(DoubleObjPair<Double> pair : hist) { + DoubleVector row = new DoubleVector(new double[] { pair.first, pair.getSecond() }); res.add(row); } HistogramResult<DoubleVector> result = new HistogramResult<DoubleVector>("Ranking Quality Histogram", "ranking-histogram", res); diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/statistics/package-info.java b/src/de/lmu/ifi/dbs/elki/algorithm/statistics/package-info.java index 3928f01a..e706d586 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/statistics/package-info.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/statistics/package-info.java @@ -8,7 +8,7 @@ This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures -Copyright (C) 2011 +Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team |