From ace5fa7f57d49756c0e1b111a30f3b6a9436c1cb Mon Sep 17 00:00:00 2001 From: Andrej Shadura Date: Sat, 9 Mar 2019 22:30:33 +0000 Subject: Import Upstream version 0.5.0 --- ...s.elki.algorithm.AbstractDistanceBasedAlgorithm | 4 + ...gorithm.AbstractPrimitiveDistanceBasedAlgorithm | 1 + .../elki/de.lmu.ifi.dbs.elki.algorithm.Algorithm | 15 +- ...s.elki.algorithm.clustering.ClusteringAlgorithm | 5 + ...elki.algorithm.clustering.gdbscan.CorePredicate | 1 + ....algorithm.clustering.gdbscan.NeighborPredicate | 1 + ...lgorithm.clustering.kmeans.KMeansInitialization | 1 + ...orithm.clustering.kmeans.KMedoidsInitialization | 4 + ...clustering.subspace.SubspaceClusteringAlgorithm | 4 + ...ifi.dbs.elki.algorithm.outlier.OutlierAlgorithm | 9 +- .../elki/de.lmu.ifi.dbs.elki.data.NumberVector | 1 + .../de.lmu.ifi.dbs.elki.data.SparseNumberVector | 2 + ...lmu.ifi.dbs.elki.datasource.filter.ObjectFilter | 6 +- ...lmu.ifi.dbs.elki.datasource.filter.StreamFilter | 10 + .../de.lmu.ifi.dbs.elki.datasource.parser.Parser | 8 +- ...elki.distance.distancefunction.DistanceFunction | 4 + ...i.dbs.elki.distance.distancefunction.DoubleNorm | 6 +- ...istance.distancefunction.LPNormDistanceFunction | 5 + ...ance.distancefunction.PrimitiveDistanceFunction | 4 + ...istancefunction.PrimitiveDoubleDistanceFunction | 4 + ...stancefunction.SpatialPrimitiveDistanceFunction | 4 +- .../elki/de.lmu.ifi.dbs.elki.evaluation.Evaluator | 9 +- .../elki/de.lmu.ifi.dbs.elki.index.IndexFactory | 1 + .../elki/de.lmu.ifi.dbs.elki.index.KNNIndex | 3 +- .../elki/de.lmu.ifi.dbs.elki.index.RangeIndex | 3 +- ...dbs.elki.math.linearalgebra.pca.EigenPairFilter | 1 + ...u.ifi.dbs.elki.math.linearalgebra.pca.PCARunner | 3 + ...bs.elki.math.statistics.tests.GoodnessOfFitTest | 2 + ...i.dbs.elki.visualization.visualizers.VisFactory | 2 +- src/de/lmu/ifi/dbs/elki/algorithm/APRIORI.java | 8 +- src/de/lmu/ifi/dbs/elki/algorithm/Algorithm.java | 5 +- .../dbs/elki/algorithm/DependencyDerivator.java | 2 +- 
.../lmu/ifi/dbs/elki/algorithm/DummyAlgorithm.java | 8 +- .../ifi/dbs/elki/algorithm/KNNDistanceOrder.java | 6 +- src/de/lmu/ifi/dbs/elki/algorithm/KNNJoin.java | 8 +- .../dbs/elki/algorithm/MaterializeDistances.java | 11 +- .../lmu/ifi/dbs/elki/algorithm/NullAlgorithm.java | 2 +- .../clustering/AbstractProjectedClustering.java | 3 +- .../clustering/AbstractProjectedDBSCAN.java | 56 +- .../algorithm/clustering/ClusteringAlgorithm.java | 2 +- .../ifi/dbs/elki/algorithm/clustering/DBSCAN.java | 41 +- .../ifi/dbs/elki/algorithm/clustering/DeLiClu.java | 8 +- .../lmu/ifi/dbs/elki/algorithm/clustering/EM.java | 66 +- .../ifi/dbs/elki/algorithm/clustering/OPTICS.java | 19 +- .../algorithm/clustering/OPTICSTypeAlgorithm.java | 2 +- .../ifi/dbs/elki/algorithm/clustering/SLINK.java | 37 +- .../elki/algorithm/clustering/SNNClustering.java | 22 +- .../algorithm/clustering/correlation/CASH.java | 40 +- .../algorithm/clustering/correlation/COPAC.java | 9 +- .../algorithm/clustering/correlation/ERiC.java | 4 +- .../algorithm/clustering/correlation/LMCLUS.java | 24 +- .../algorithm/clustering/correlation/ORCLUS.java | 30 +- .../correlation/cash/CASHIntervalSplit.java | 4 +- .../clustering/gdbscan/CorePredicate.java | 80 + .../gdbscan/EpsilonNeighborPredicate.java | 268 ++ .../clustering/gdbscan/GeneralizedDBSCAN.java | 323 ++ .../clustering/gdbscan/MinPtsCorePredicate.java | 178 + .../clustering/gdbscan/NeighborPredicate.java | 94 + .../algorithm/clustering/gdbscan/package-info.java | 43 + .../clustering/kmeans/AbstractKMeans.java | 168 +- .../kmeans/AbstractKMeansInitialization.java | 9 +- .../clustering/kmeans/FirstKInitialMeans.java | 32 +- .../elki/algorithm/clustering/kmeans/KMeans.java | 55 + .../clustering/kmeans/KMeansInitialization.java | 8 +- .../algorithm/clustering/kmeans/KMeansLloyd.java | 8 +- .../clustering/kmeans/KMeansMacQueen.java | 10 +- .../kmeans/KMeansPlusPlusInitialMeans.java | 84 +- .../algorithm/clustering/kmeans/KMediansLloyd.java | 172 + 
.../algorithm/clustering/kmeans/KMedoidsEM.java | 271 ++ .../clustering/kmeans/KMedoidsInitialization.java | 45 + .../algorithm/clustering/kmeans/KMedoidsPAM.java | 310 ++ .../clustering/kmeans/PAMInitialMeans.java | 187 + .../kmeans/RandomlyChosenInitialMeans.java | 22 +- .../kmeans/RandomlyGeneratedInitialMeans.java | 9 +- .../elki/algorithm/clustering/subspace/CLIQUE.java | 19 +- .../elki/algorithm/clustering/subspace/DiSH.java | 8 +- .../algorithm/clustering/subspace/PROCLUS.java | 58 +- .../elki/algorithm/clustering/subspace/SUBCLU.java | 7 +- .../subspace/SubspaceClusteringAlgorithm.java | 39 + .../clustering/trivial/ByLabelClustering.java | 49 +- .../trivial/ByLabelHierarchicalClustering.java | 61 +- .../trivial/ByLabelOrAllInOneClustering.java | 74 + .../clustering/trivial/ByModelClustering.java | 8 +- .../lmu/ifi/dbs/elki/algorithm/outlier/ABOD.java | 142 +- .../lmu/ifi/dbs/elki/algorithm/outlier/ALOCI.java | 724 ++++ .../outlier/AbstractAggarwalYuOutlier.java | 14 +- .../elki/algorithm/outlier/AbstractDBOutlier.java | 5 +- .../algorithm/outlier/AggarwalYuEvolutionary.java | 10 +- .../elki/algorithm/outlier/AggarwalYuNaive.java | 14 +- .../elki/algorithm/outlier/DBOutlierDetection.java | 31 +- .../dbs/elki/algorithm/outlier/DBOutlierScore.java | 8 +- .../ifi/dbs/elki/algorithm/outlier/EMOutlier.java | 59 +- .../dbs/elki/algorithm/outlier/GaussianModel.java | 18 +- .../algorithm/outlier/GaussianUniformMixture.java | 21 +- .../lmu/ifi/dbs/elki/algorithm/outlier/HilOut.java | 988 +++++ .../lmu/ifi/dbs/elki/algorithm/outlier/INFLO.java | 36 +- .../ifi/dbs/elki/algorithm/outlier/KNNOutlier.java | 8 +- .../elki/algorithm/outlier/KNNWeightOutlier.java | 8 +- .../lmu/ifi/dbs/elki/algorithm/outlier/LDOF.java | 37 +- .../lmu/ifi/dbs/elki/algorithm/outlier/LOCI.java | 33 +- src/de/lmu/ifi/dbs/elki/algorithm/outlier/LOF.java | 55 +- .../lmu/ifi/dbs/elki/algorithm/outlier/LoOP.java | 43 +- .../ifi/dbs/elki/algorithm/outlier/OPTICSOF.java | 47 +- 
.../lmu/ifi/dbs/elki/algorithm/outlier/OUTRES.java | 368 -- .../ifi/dbs/elki/algorithm/outlier/OnlineLOF.java | 39 +- .../elki/algorithm/outlier/OutlierAlgorithm.java | 2 +- .../outlier/ReferenceBasedOutlierDetection.java | 26 +- src/de/lmu/ifi/dbs/elki/algorithm/outlier/SOD.java | 469 --- .../outlier/meta/ExternalDoubleOutlierScore.java | 10 +- .../algorithm/outlier/meta/FeatureBagging.java | 27 +- .../ifi/dbs/elki/algorithm/outlier/meta/HiCS.java | 633 ++++ .../outlier/meta/RescaleMetaOutlierAlgorithm.java | 13 +- .../spatial/CTLuGLSBackwardSearchAlgorithm.java | 13 +- .../spatial/CTLuMeanMultipleAttributes.java | 7 +- .../outlier/spatial/CTLuMedianAlgorithm.java | 13 +- .../spatial/CTLuMedianMultipleAttributes.java | 11 +- .../spatial/CTLuMoranScatterplotOutlier.java | 10 +- .../outlier/spatial/CTLuRandomWalkEC.java | 4 +- .../outlier/spatial/CTLuScatterplotOutlier.java | 13 +- .../outlier/spatial/CTLuZTestOutlier.java | 14 +- .../dbs/elki/algorithm/outlier/spatial/SLOM.java | 20 +- .../dbs/elki/algorithm/outlier/spatial/SOF.java | 15 +- .../outlier/spatial/TrimmedMeanApproach.java | 14 +- .../spatial/neighborhood/ExtendedNeighborhood.java | 11 +- .../spatial/neighborhood/ExternalNeighborhood.java | 4 +- .../PrecomputedKNearestNeighborNeighborhood.java | 4 +- .../LinearWeightedExtendedNeighborhood.java | 7 +- .../weighted/UnweightedNeighborhoodAdapter.java | 4 +- .../elki/algorithm/outlier/subspace/OUTRES.java | 428 +++ .../elki/algorithm/outlier/subspace/OutRankS1.java | 199 + .../dbs/elki/algorithm/outlier/subspace/SOD.java | 479 +++ .../algorithm/outlier/subspace/package-info.java | 28 + .../algorithm/outlier/trivial/ByLabelOutlier.java | 15 +- .../outlier/trivial/TrivialAllOutlier.java | 6 +- .../outlier/trivial/TrivialGeneratedOutlier.java | 14 +- .../outlier/trivial/TrivialNoOutlier.java | 8 +- .../elki/algorithm/statistics/AddSingleScale.java | 97 + .../algorithm/statistics/AveragePrecisionAtK.java | 6 +- .../statistics/DistanceStatisticsWithClasses.java 
| 90 +- .../statistics/EvaluateRankingQuality.java | 13 +- .../statistics/RankingQualityHistogram.java | 8 +- .../ifi/dbs/elki/application/GeneratorXMLSpec.java | 4 +- .../cache/CacheDoubleDistanceInOnDiskMatrix.java | 10 +- .../cache/CacheFloatDistanceInOnDiskMatrix.java | 10 +- .../greedyensemble/ComputeKNNOutlierScores.java | 8 +- .../greedyensemble/GreedyEnsembleExperiment.java | 27 +- .../VisualizePairwiseGainMatrix.java | 7 +- .../application/internal/CheckELKIServices.java | 2 +- .../dbs/elki/application/jsmap/JSONWebServer.java | 15 +- .../application/visualization/KNNExplorer.java | 9 +- src/de/lmu/ifi/dbs/elki/data/Cluster.java | 16 +- src/de/lmu/ifi/dbs/elki/data/Clustering.java | 5 +- src/de/lmu/ifi/dbs/elki/data/NumberVector.java | 1 - .../lmu/ifi/dbs/elki/data/SparseDoubleVector.java | 341 ++ .../lmu/ifi/dbs/elki/data/SparseFloatVector.java | 35 +- .../lmu/ifi/dbs/elki/data/SparseNumberVector.java | 24 +- src/de/lmu/ifi/dbs/elki/data/VectorUtil.java | 119 + .../data/model/CorrelationAnalysisSolution.java | 28 +- .../lmu/ifi/dbs/elki/data/model/MedoidModel.java | 76 + .../data/projection/AbstractFeatureSelection.java | 64 - .../dbs/elki/data/projection/FeatureSelection.java | 14 +- .../data/projection/NumericalFeatureSelection.java | 44 +- .../elki/data/synthetic/bymodel/GeneratorMain.java | 19 +- .../synthetic/bymodel/GeneratorSingleCluster.java | 15 +- .../data/type/NoSupportedDataTypeException.java | 29 +- src/de/lmu/ifi/dbs/elki/data/type/TypeUtil.java | 41 + .../elki/data/type/VectorFieldTypeInformation.java | 12 +- .../dbs/elki/data/type/VectorTypeInformation.java | 1 + .../ifi/dbs/elki/database/AbstractDatabase.java | 18 +- .../lmu/ifi/dbs/elki/database/HashmapDatabase.java | 10 +- .../ifi/dbs/elki/database/datastore/DataStore.java | 5 +- .../elki/database/datastore/DataStoreFactory.java | 32 + .../elki/database/datastore/DataStoreIDMap.java | 4 +- .../dbs/elki/database/datastore/DataStoreUtil.java | 35 + 
.../elki/database/datastore/DoubleDataStore.java | 14 +- .../elki/database/datastore/IntegerDataStore.java | 50 + .../dbs/elki/database/datastore/RangeIDMap.java | 4 +- .../elki/database/datastore/WritableDataStore.java | 8 +- .../datastore/WritableDoubleDataStore.java | 15 +- .../datastore/WritableIntegerDataStore.java | 64 + .../database/datastore/WritableRecordStore.java | 4 +- .../datastore/memory/ArrayDoubleStore.java | 32 +- .../datastore/memory/ArrayIntegerStore.java | 137 + .../datastore/memory/ArrayRecordStore.java | 14 +- .../elki/database/datastore/memory/ArrayStore.java | 8 +- .../memory/MapIntegerDBIDDoubleStore.java | 31 +- .../memory/MapIntegerDBIDIntegerStore.java | 109 + .../memory/MapIntegerDBIDRecordStore.java | 14 +- .../datastore/memory/MapIntegerDBIDStore.java | 8 +- .../database/datastore/memory/MapRecordStore.java | 20 +- .../elki/database/datastore/memory/MapStore.java | 14 +- .../datastore/memory/MemoryDataStoreFactory.java | 41 + .../lmu/ifi/dbs/elki/database/ids/ArrayDBIDs.java | 4 +- .../elki/database/ids/ArrayModifiableDBIDs.java | 8 + src/de/lmu/ifi/dbs/elki/database/ids/DBID.java | 64 +- .../lmu/ifi/dbs/elki/database/ids/DBIDFactory.java | 8 +- src/de/lmu/ifi/dbs/elki/database/ids/DBIDIter.java | 41 +- .../lmu/ifi/dbs/elki/database/ids/DBIDMIter.java | 38 + .../lmu/ifi/dbs/elki/database/ids/DBIDRange.java | 3 +- src/de/lmu/ifi/dbs/elki/database/ids/DBIDRef.java | 92 + src/de/lmu/ifi/dbs/elki/database/ids/DBIDUtil.java | 52 +- src/de/lmu/ifi/dbs/elki/database/ids/DBIDs.java | 30 +- .../lmu/ifi/dbs/elki/database/ids/EmptyDBIDs.java | 25 +- .../ifi/dbs/elki/database/ids/ModifiableDBIDs.java | 23 +- .../elki/database/ids/generic/DBIDIterAdapter.java | 30 +- .../ids/generic/GenericArrayModifiableDBIDs.java | 28 +- .../ids/generic/GenericHashSetModifiableDBIDs.java | 27 +- .../dbs/elki/database/ids/generic/MaskedDBIDs.java | 35 +- .../dbs/elki/database/ids/generic/MergedDBIDs.java | 3 +- .../ids/generic/UnmodifiableArrayDBIDs.java | 101 + 
.../database/ids/generic/UnmodifiableDBIDs.java | 4 +- .../ids/integer/IntegerArrayStaticDBIDs.java | 24 +- .../dbs/elki/database/ids/integer/IntegerDBID.java | 51 +- .../database/ids/integer/IntegerDBIDRange.java | 22 +- .../database/ids/integer/SimpleDBIDFactory.java | 3 +- .../database/ids/integer/TrivialDBIDFactory.java | 3 +- .../elki/database/ids/integer/TroveArrayDBIDs.java | 38 +- .../ids/integer/TroveArrayModifiableDBIDs.java | 10 +- .../ids/integer/TroveHashSetModifiableDBIDs.java | 24 +- .../elki/database/query/DistanceDBIDResult.java | 41 + .../elki/database/query/DistanceResultPair.java | 10 +- .../database/query/DoubleDistanceResultPair.java | 20 +- .../database/query/GenericDistanceDBIDList.java | 68 + .../database/query/GenericDistanceResultPair.java | 18 +- .../distance/AbstractDatabaseDistanceQuery.java | 5 +- .../query/distance/AbstractDistanceQuery.java | 8 +- .../database/query/distance/DBIDDistanceQuery.java | 3 +- .../database/query/distance/DistanceQuery.java | 8 +- .../query/distance/PrimitiveDistanceQuery.java | 8 +- .../distance/PrimitiveDistanceSimilarityQuery.java | 8 +- .../query/knn/AbstractDistanceKNNQuery.java | 4 +- .../ifi/dbs/elki/database/query/knn/KNNQuery.java | 3 +- .../ifi/dbs/elki/database/query/knn/KNNResult.java | 5 +- .../ifi/dbs/elki/database/query/knn/KNNUtil.java | 24 +- .../database/query/knn/LinearScanKNNQuery.java | 28 +- .../knn/LinearScanPrimitiveDistanceKNNQuery.java | 11 +- .../knn/LinearScanRawDoubleDistanceKNNQuery.java | 17 +- .../database/query/knn/PreprocessorKNNQuery.java | 12 +- .../query/range/AbstractDistanceRangeQuery.java | 10 +- .../LinearScanPrimitiveDistanceRangeQuery.java | 9 +- .../database/query/range/LinearScanRangeQuery.java | 28 +- .../LinearScanRawDoubleDistanceRangeQuery.java | 28 +- .../dbs/elki/database/query/range/RangeQuery.java | 12 +- .../database/query/rknn/AbstractRKNNQuery.java | 4 +- .../database/query/rknn/LinearScanRKNNQuery.java | 35 +- 
.../database/query/rknn/PreprocessorRKNNQuery.java | 9 +- .../dbs/elki/database/query/rknn/RKNNQuery.java | 4 +- .../similarity/AbstractDBIDSimilarityQuery.java | 6 +- .../query/similarity/AbstractSimilarityQuery.java | 8 +- .../query/similarity/PrimitiveSimilarityQuery.java | 8 +- .../database/query/similarity/SimilarityQuery.java | 8 +- .../database/relation/ConvertToStringView.java | 12 +- .../ifi/dbs/elki/database/relation/DBIDView.java | 19 +- .../database/relation/MaterializedRelation.java | 15 +- .../dbs/elki/database/relation/ProjectedView.java | 12 +- .../ifi/dbs/elki/database/relation/ProxyView.java | 17 +- .../ifi/dbs/elki/database/relation/Relation.java | 32 +- .../datasource/ArrayAdapterDatabaseConnection.java | 53 + .../datasource/GeneratorXMLDatabaseConnection.java | 50 +- .../AbstractRandomFeatureSelectionFilter.java | 32 +- .../filter/DoubleVectorRandomProjectionFilter.java | 7 +- .../ifi/dbs/elki/datasource/filter/FilterUtil.java | 63 + .../datasource/filter/NoMissingValuesFilter.java | 44 +- .../filter/RandomSamplingStreamFilter.java | 139 + .../filter/SparseFloatVectorProjectionFilter.java | 88 - .../SparseFloatVectorRandomProjectionFilter.java | 82 - .../filter/SparseNumberVectorProjectionFilter.java | 86 + .../SparseNumberVectorRandomProjectionFilter.java | 83 + .../datasource/filter/SparseVectorFieldFilter.java | 22 +- .../normalization/AbstractNormalization.java | 4 +- .../normalization/AbstractStreamNormalization.java | 68 + .../InverseDocumentFrequencyNormalization.java | 27 +- .../filter/normalization/LengthNormalization.java | 2 +- .../filter/normalization/TFIDFNormalization.java | 12 +- .../GlobalPrincipalComponentAnalysisTransform.java | 135 + .../datasource/filter/transform/package-info.java | 26 + .../dbs/elki/datasource/parser/AbstractParser.java | 14 +- .../datasource/parser/DoubleVectorLabelParser.java | 5 +- .../datasource/parser/FloatVectorLabelParser.java | 3 + .../datasource/parser/NumberVectorLabelParser.java | 68 +- 
.../ParameterizationFunctionLabelParser.java | 141 - .../parser/SparseFloatVectorLabelParser.java | 102 +- .../parser/SparseNumberVectorLabelParser.java | 185 + .../AbstractDBIDDistanceFunction.java | 3 +- .../distancefunction/DBIDDistanceFunction.java | 3 +- .../LocallyWeightedDistanceFunction.java | 4 +- .../distance/distancefunction/MinKDistance.java | 4 +- .../distancefunction/ProxyDistanceFunction.java | 4 +- .../RandomStableDistanceFunction.java | 8 +- ...aredNearestNeighborJaccardDistanceFunction.java | 43 +- .../SparseEuclideanDistanceFunction.java | 109 + .../SparseLPNormDistanceFunction.java | 162 + .../SparseManhattanDistanceFunction.java | 106 + .../SparseMaximumDistanceFunction.java | 106 + .../adapter/AbstractSimilarityAdapter.java | 4 +- .../correlation/ERiCDistanceFunction.java | 4 +- .../PCABasedCorrelationDistanceFunction.java | 16 +- .../DiskCacheBasedDoubleDistanceFunction.java | 4 +- .../DiskCacheBasedFloatDistanceFunction.java | 4 +- .../external/FileBasedDoubleDistanceFunction.java | 4 +- .../external/FileBasedFloatDistanceFunction.java | 8 +- .../external/NumberDistanceParser.java | 18 +- ...enceVectorBasedCorrelationDistanceFunction.java | 3 +- .../subspace/LocalSubspaceDistanceFunction.java | 4 +- .../distance/distancevalue/DoubleDistance.java | 12 + .../elki/distance/distancevalue/FloatDistance.java | 12 + ...nalSharedNearestNeighborSimilarityFunction.java | 30 +- .../SharedNearestNeighborSimilarityFunction.java | 30 +- .../dbs/elki/evaluation/AutomaticEvaluation.java | 159 + .../dbs/elki/evaluation/NoAutomaticEvaluation.java | 56 + .../ifi/dbs/elki/evaluation/clustering/BCubed.java | 1 - .../clustering/ClusterContingencyTable.java | 15 +- .../dbs/elki/evaluation/clustering/Entropy.java | 6 + .../evaluation/clustering/EvaluateClustering.java | 26 +- .../pairsegments/ClusterPairSegmentAnalysis.java | 22 + .../clustering/pairsegments/Segment.java | 22 + .../clustering/pairsegments/Segments.java | 29 +- 
.../histogram/ComputeOutlierHistogram.java | 66 +- .../evaluation/outlier/JudgeOutlierScores.java | 10 +- .../outlier/OutlierPrecisionAtKCurve.java | 227 ++ .../outlier/OutlierPrecisionRecallCurve.java | 238 ++ .../elki/evaluation/outlier/OutlierROCCurve.java | 241 ++ .../elki/evaluation/outlier/OutlierSmROCCurve.java | 269 ++ .../outlier/OutlierThresholdClustering.java | 175 + .../dbs/elki/evaluation/roc/ComputeROCCurve.java | 183 +- src/de/lmu/ifi/dbs/elki/evaluation/roc/ROC.java | 198 +- .../ComputeSimilarityMatrixImage.java | 40 +- .../configurator/ClassParameterConfigurator.java | 2 +- .../configurator/EnumParameterConfigurator.java | 2 +- .../configurator/TextParameterConfigurator.java | 2 +- src/de/lmu/ifi/dbs/elki/gui/minigui/MiniGUI.java | 2 +- .../ifi/dbs/elki/gui/util/DynamicParameters.java | 4 +- src/de/lmu/ifi/dbs/elki/gui/util/LogPane.java | 6 +- .../lmu/ifi/dbs/elki/gui/util/ParameterTable.java | 23 +- .../lmu/ifi/dbs/elki/gui/util/ParametersModel.java | 12 +- .../ifi/dbs/elki/index/AbstractRefiningIndex.java | 35 +- .../index/preprocessed/LocalProjectionIndex.java | 6 +- .../knn/AbstractMaterializeKNNPreprocessor.java | 8 +- .../knn/MaterializeKNNAndRKNNPreprocessor.java | 51 +- .../knn/MaterializeKNNPreprocessor.java | 42 +- ...dexApproximationMaterializeKNNPreprocessor.java | 11 +- ...ionApproximationMaterializeKNNPreprocessor.java | 18 +- .../knn/RandomSampleKNNPreprocessor.java | 13 +- ...ialApproximationMaterializeKNNPreprocessor.java | 11 +- .../localpca/AbstractFilteredPCAIndex.java | 7 +- .../localpca/FilteredLocalPCAIndex.java | 4 +- .../localpca/RangeQueryFilteredPCAIndex.java | 6 +- .../preference/AbstractPreferenceVectorIndex.java | 4 +- .../preference/DiSHPreferenceVectorIndex.java | 13 +- .../preference/HiSCPreferenceVectorIndex.java | 7 +- .../preference/PreferenceVectorIndex.java | 6 +- .../snn/SharedNearestNeighborIndex.java | 6 +- .../snn/SharedNearestNeighborPreprocessor.java | 17 +- .../AbstractSubspaceProjectionIndex.java | 26 
+- .../subspaceproj/FourCSubspaceIndex.java | 6 +- .../subspaceproj/PreDeConSubspaceIndex.java | 7 +- .../subspaceproj/SubspaceProjectionIndex.java | 4 +- .../tree/metrical/mtreevariants/AbstractMTree.java | 11 +- .../mtreevariants/mktrees/AbstractMkTree.java | 4 +- .../mtreevariants/mktrees/mkapp/MkAppTree.java | 9 +- .../mktrees/mkapp/MkAppTreeIndex.java | 4 +- .../mktrees/mkapp/PolynomialApproximation.java | 2 +- .../mtreevariants/mktrees/mkcop/MkCoPTree.java | 13 +- .../mktrees/mkcop/MkCoPTreeIndex.java | 4 +- .../mtreevariants/mktrees/mkmax/MkMaxTree.java | 9 +- .../mktrees/mkmax/MkMaxTreeIndex.java | 4 +- .../mtreevariants/mktrees/mktab/MkTabEntry.java | 2 +- .../mtreevariants/mktrees/mktab/MkTabTree.java | 5 +- .../mktrees/mktab/MkTabTreeIndex.java | 4 +- .../metrical/mtreevariants/mtree/MTreeIndex.java | 4 +- .../mtreevariants/query/MetricalIndexKNNQuery.java | 3 +- .../query/MetricalIndexRangeQuery.java | 10 +- .../mtreevariants/query/MkTreeRKNNQuery.java | 4 +- .../spatial/rstarvariants/AbstractRStarTree.java | 8 +- .../rstarvariants/deliclu/DeLiCluEntry.java | 2 +- .../rstarvariants/deliclu/DeLiCluLeafEntry.java | 2 +- .../rstarvariants/deliclu/DeLiCluTreeIndex.java | 13 +- .../query/DoubleDistanceRStarTreeKNNQuery.java | 14 +- .../query/DoubleDistanceRStarTreeRangeQuery.java | 15 +- .../query/GenericRStarTreeKNNQuery.java | 16 +- .../query/GenericRStarTreeRangeQuery.java | 15 +- .../rstarvariants/rstar/RStarTreeIndex.java | 13 +- ...ApproximativeLeastOverlapInsertionStrategy.java | 2 +- src/de/lmu/ifi/dbs/elki/index/vafile/DAFile.java | 111 + .../ifi/dbs/elki/index/vafile/PartialVAFile.java | 838 +++++ src/de/lmu/ifi/dbs/elki/index/vafile/VAFile.java | 28 +- .../lmu/ifi/dbs/elki/logging/CLISmartHandler.java | 5 +- .../lmu/ifi/dbs/elki/logging/ErrorFormatter.java | 90 + .../ifi/dbs/elki/logging/LoggingConfiguration.java | 21 + src/de/lmu/ifi/dbs/elki/math/MathUtil.java | 21 +- .../lmu/ifi/dbs/elki/math/geometry/AlphaShape.java | 3 + 
.../elki/math/geometry/GrahamScanConvexHull2D.java | 2 + .../elki/math/geometry/SweepHullDelaunay2D.java | 4 + src/de/lmu/ifi/dbs/elki/math/geometry/XYCurve.java | 418 +++ .../elki/math/histograms/AggregatingHistogram.java | 2 +- .../dbs/elki/math/histograms/FlexiHistogram.java | 2 +- .../elki/math/histograms/ReplacingHistogram.java | 2 +- .../ifi/dbs/elki/math/linearalgebra/Centroid.java | 13 +- .../elki/math/linearalgebra/CovarianceMatrix.java | 13 +- .../math/linearalgebra/LinearEquationSystem.java | 2 +- .../ifi/dbs/elki/math/linearalgebra/Matrix.java | 1 - .../elki/math/linearalgebra/ProjectedCentroid.java | 13 +- .../linearalgebra/pca/DropEigenPairFilter.java | 146 + .../math/linearalgebra/pca/FilteredEigenPairs.java | 4 +- .../pca/PCAFilteredAutotuningRunner.java | 4 +- .../math/linearalgebra/pca/PCAFilteredResult.java | 60 +- .../dbs/elki/math/linearalgebra/pca/PCAResult.java | 25 +- .../pca/SignificantEigenPairFilter.java | 3 - .../pca/WeightedCovarianceMatrixBuilder.java | 12 +- src/de/lmu/ifi/dbs/elki/math/scales/Scales.java | 6 +- .../spacefillingcurves/HilbertSpatialSorter.java | 2 +- .../dbs/elki/math/spacefillingcurves/ZCurve.java | 264 -- .../math/spacefillingcurves/ZCurveTransformer.java | 124 + .../statistics/distribution/BetaDistribution.java | 499 +++ .../statistics/distribution/ChiDistribution.java | 17 +- .../distribution/ChiSquaredDistribution.java | 23 +- .../distribution/ConstantDistribution.java | 7 +- .../math/statistics/distribution/Distribution.java | 20 +- .../distribution/DistributionWithRandom.java | 37 + .../statistics/distribution/GammaDistribution.java | 496 ++- .../distribution/NormalDistribution.java | 26 +- .../distribution/PoissonDistribution.java | 411 ++ .../distribution/StudentsTDistribution.java | 90 + .../distribution/UniformDistribution.java | 14 +- .../math/statistics/tests/GoodnessOfFitTest.java | 49 + .../statistics/tests/KolmogorovSmirnovTest.java | 117 + .../dbs/elki/math/statistics/tests/WelchTTest.java | 124 + 
.../elki/math/statistics/tests/package-info.java | 26 + .../lmu/ifi/dbs/elki/result/KMLOutputHandler.java | 4 +- .../ifi/dbs/elki/result/OrderingFromDataStore.java | 6 +- src/de/lmu/ifi/dbs/elki/result/OrderingResult.java | 5 +- src/de/lmu/ifi/dbs/elki/result/ResultUtil.java | 68 +- src/de/lmu/ifi/dbs/elki/result/ScalesResult.java | 13 +- .../dbs/elki/result/optics/ClusterOrderResult.java | 50 +- .../optics/DoubleDistanceClusterOrderEntry.java | 2 +- .../result/optics/GenericClusterOrderEntry.java | 2 +- .../elki/result/outlier/OrderingFromRelation.java | 6 +- .../ifi/dbs/elki/result/textwriter/TextWriter.java | 15 +- src/de/lmu/ifi/dbs/elki/utilities/Base64.java | 22 + .../lmu/ifi/dbs/elki/utilities/DatabaseUtil.java | 58 +- .../lmu/ifi/dbs/elki/utilities/InspectionUtil.java | 15 +- src/de/lmu/ifi/dbs/elki/utilities/Util.java | 14 +- .../elki/utilities/datastructures/QuickSelect.java | 178 + .../arraylike/ArrayDBIDsAdapter.java | 2 + .../datastructures/arraylike/ArrayLikeUtil.java | 3 + .../datastructures/arraylike/ListArrayAdapter.java | 22 - .../arraylike/SubsetNumberArrayAdapter.java | 95 + .../utilities/datastructures/heap/KNNHeap.java | 6 +- .../utilities/datastructures/heap/KNNList.java | 27 +- .../datastructures/heap/UpdatableHeap.java | 2 + .../datastructures/hierarchy/Hierarchical.java | 7 +- .../datastructures/hierarchy/Hierarchy.java | 7 +- .../hierarchy/HierarchyHashmapList.java | 19 +- .../hierarchy/HierarchyReferenceLists.java | 19 +- .../dbs/elki/utilities/iterator/EmptyIterator.java | 11 +- .../lmu/ifi/dbs/elki/utilities/iterator/Iter.java | 71 + .../elki/utilities/iterator/IterableIterator.java | 43 - .../iterator/IterableIteratorAdapter.java | 110 - .../dbs/elki/utilities/iterator/IterableUtil.java | 55 - .../elki/utilities/iterator/MergedIterator.java | 9 +- .../elki/utilities/iterator/OneItemIterator.java | 7 +- .../utilities/iterator/TypeFilterIterator.java | 7 +- .../elki/utilities/optionhandling/OptionID.java | 8 + 
.../lmu/ifi/dbs/elki/utilities/package-info.java | 1 - .../lmu/ifi/dbs/elki/utilities/pairs/CTriple.java | 2 +- .../RandomSampleReferencePoints.java | 4 +- .../referencepoints/StarBasedReferencePoints.java | 4 +- .../outlier/HeDESNormalizationOutlierScaling.java | 4 +- .../scaling/outlier/MinusLogGammaScaling.java | 7 +- .../outlier/MinusLogStandardDeviationScaling.java | 7 +- .../MixtureModelOutlierScalingFunction.java | 4 +- .../outlier/MultiplicativeInverseScaling.java | 4 +- .../scaling/outlier/OutlierGammaScaling.java | 4 +- .../scaling/outlier/OutlierLinearScaling.java | 13 +- .../scaling/outlier/OutlierMinusLogScaling.java | 4 +- .../scaling/outlier/OutlierSqrtScaling.java | 4 +- .../outlier/RankingPseudoOutlierScaling.java | 4 +- .../outlier/SigmoidOutlierScalingFunction.java | 4 +- .../outlier/SqrtStandardDeviationScaling.java | 10 +- .../scaling/outlier/StandardDeviationScaling.java | 7 +- .../scaling/outlier/TopKOutlierScaling.java | 14 +- .../dbs/elki/utilities/xml/XMLNodeIterator.java | 14 +- .../elki/utilities/xml/XMLNodeListIterator.java | 14 +- .../dbs/elki/visualization/VisualizationTask.java | 1 - .../dbs/elki/visualization/VisualizerContext.java | 4 +- .../visualization/VisualizerParameterizer.java | 10 +- .../visualization/batikutil/CloneInlineImages.java | 1 - .../visualization/batikutil/NodeAppendChild.java | 5 +- .../visualization/gui/SelectionTableWindow.java | 5 +- .../elki/visualization/gui/detail/DetailView.java | 7 + .../elki/visualization/gui/overview/LayerMap.java | 3 + .../visualization/gui/overview/OverviewPlot.java | 5 +- .../elki/visualization/gui/overview/PlotItem.java | 2 + .../gui/overview/RectangleArranger.java | 12 +- .../projector/HistogramProjector.java | 3 +- .../projector/ParallelPlotFactory.java | 7 +- .../projector/ScatterPlotProjector.java | 3 +- .../visualization/style/ClassStylingPolicy.java | 9 +- .../visualization/style/ClusterStylingPolicy.java | 11 +- .../dbs/elki/visualization/style/StyleLibrary.java | 5 + 
.../dbs/elki/visualization/style/StyleResult.java | 3 + .../elki/visualization/style/StylingPolicy.java | 6 +- .../elki/visualization/style/classic.properties | 2 +- .../elki/visualization/style/default.properties | 2 +- .../elki/visualization/style/greyscale.properties | 2 +- .../dbs/elki/visualization/style/neon.properties | 2 +- .../visualization/style/presentation.properties | 2 +- .../dbs/elki/visualization/style/print.properties | 85 + .../ifi/dbs/elki/visualization/svg/SVGPlot.java | 33 +- .../dbs/elki/visualization/svg/SVGScoreBar.java | 44 +- .../visualization/svg/SVGSimpleLinearAxis.java | 48 +- .../visualization/visualizers/VisualizerUtil.java | 35 +- .../histogram/ColoredHistogramVisualizer.java | 30 +- .../optics/OPTICSClusterVisualization.java | 33 +- .../optics/OPTICSPlotCutVisualization.java | 5 +- .../optics/OPTICSPlotSelectionVisualization.java | 4 +- .../visualizers/optics/OPTICSPlotVisualizer.java | 5 +- .../optics/OPTICSSteepAreaVisualization.java | 4 +- .../pairsegments/CircleSegmentsVisualizer.java | 25 +- .../pairsegments/SegmentsStylingPolicy.java | 52 +- .../visualizers/parallel/LineVisualization.java | 25 +- .../parallel/ParallelAxisVisualization.java | 7 +- .../cluster/ClusterOutlineVisualization.java | 9 +- .../cluster/ClusterParallelMeanVisualization.java | 11 +- .../parallel/index/RTreeParallelVisualization.java | 4 +- .../selection/SelectionAxisRangeVisualization.java | 7 +- .../selection/SelectionLineVisualization.java | 13 +- .../SelectionToolAxisRangeVisualization.java | 27 +- .../selection/SelectionToolLineVisualization.java | 23 +- .../visualizers/scatterplot/AxisVisualization.java | 5 +- .../scatterplot/MarkerVisualization.java | 23 +- .../scatterplot/PolygonVisualization.java | 13 +- .../scatterplot/ReferencePointsVisualization.java | 3 +- .../scatterplot/ToolBox2DVisualization.java | 6 +- .../scatterplot/TooltipScoreVisualization.java | 19 +- .../scatterplot/TooltipStringVisualization.java | 13 +- 
.../cluster/ClusterHullVisualization.java | 15 +- .../cluster/ClusterMeanVisualization.java | 55 +- .../cluster/ClusterOrderVisualization.java | 3 +- .../cluster/EMClusterVisualization.java | 5 +- .../scatterplot/cluster/VoronoiVisualization.java | 113 +- .../density/DensityEstimationOverlay.java | 12 +- .../scatterplot/index/TreeMBRVisualization.java | 7 +- .../scatterplot/index/TreeSphereVisualization.java | 7 +- .../scatterplot/outlier/BubbleVisualization.java | 29 +- .../selection/MoveObjectsToolVisualization.java | 16 +- .../SelectionConvexHullVisualization.java | 15 +- .../selection/SelectionCubeVisualization.java | 7 +- .../selection/SelectionDotVisualization.java | 13 +- .../selection/SelectionToolCubeVisualization.java | 64 +- .../selection/SelectionToolDotVisualization.java | 21 +- .../visualizers/thumbs/ThumbnailThread.java | 1 - .../visunproj/ClusterEvaluationVisFactory.java | 2 +- .../visualizers/visunproj/CurveVisFactory.java | 242 -- .../visualizers/visunproj/KeyVisualization.java | 16 +- .../visualizers/visunproj/SettingsVisFactory.java | 3 +- .../visualizers/visunproj/XYCurveVisFactory.java | 206 ++ .../lmu/ifi/dbs/elki/workflow/AlgorithmStep.java | 13 + .../lmu/ifi/dbs/elki/workflow/EvaluationStep.java | 25 +- src/de/lmu/ifi/dbs/elki/workflow/LoggingStep.java | 8 +- src/tutorial/outlier/DistanceStddevOutlier.java | 148 + .../algorithm/AbstractSimpleAlgorithmTest.java | 16 +- .../de/lmu/ifi/dbs/elki/algorithm/TestKNNJoin.java | 9 +- .../algorithm/clustering/TestDBSCANResults.java | 48 + .../algorithm/clustering/TestDeLiCluResults.java | 4 +- .../elki/algorithm/clustering/TestEMResults.java | 4 +- .../clustering/TestSNNClusteringResults.java | 4 +- .../clustering/correlation/TestCASHResults.java | 8 +- .../clustering/correlation/TestORCLUSResults.java | 5 +- .../clustering/kmeans/TestKMeansResults.java | 76 +- .../clustering/subspace/TestPROCLUSResults.java | 15 +- .../ifi/dbs/elki/algorithm/outlier/TestINFLO.java | 4 +- 
.../dbs/elki/algorithm/outlier/TestOnlineLOF.java | 6 +- .../ifi/dbs/elki/algorithm/outlier/TestSOD.java | 63 - .../dbs/elki/algorithm/outlier/meta/TestHiCS.java | 88 + .../elki/algorithm/outlier/subspace/TestSOD.java | 63 + .../dbs/elki/evaluation/roc/TestComputeROC.java | 10 +- .../ifi/dbs/elki/index/TestIndexStructures.java | 35 +- .../TestMaterializedKNNAndRKNNPreprocessor.java | 4 +- .../dbs/elki/math/TestKernelDensityFitting.java | 4 +- .../elki/math/histograms/TestFlexiHistogram.java | 6 +- .../distribution/AbstractDistributionTest.java | 77 + .../distribution/TestBetaDistribution.java | 3909 ++++++++++++++++++++ .../distribution/TestChiSquaredDistribution.java | 826 +++++ .../distribution/TestGammaDistribution.java | 1304 +++++++ .../distribution/TestNormalDistribution.java | 519 +++ .../datastructures/heap/TestHeapPerformance.java | 5 +- 587 files changed, 25434 insertions(+), 5258 deletions(-) create mode 100644 src/META-INF/elki/de.lmu.ifi.dbs.elki.algorithm.clustering.gdbscan.CorePredicate create mode 100644 src/META-INF/elki/de.lmu.ifi.dbs.elki.algorithm.clustering.gdbscan.NeighborPredicate create mode 100644 src/META-INF/elki/de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.KMedoidsInitialization create mode 100644 src/META-INF/elki/de.lmu.ifi.dbs.elki.algorithm.clustering.subspace.SubspaceClusteringAlgorithm create mode 100644 src/META-INF/elki/de.lmu.ifi.dbs.elki.data.SparseNumberVector create mode 100644 src/META-INF/elki/de.lmu.ifi.dbs.elki.datasource.filter.StreamFilter create mode 100644 src/META-INF/elki/de.lmu.ifi.dbs.elki.distance.distancefunction.LPNormDistanceFunction create mode 100644 src/META-INF/elki/de.lmu.ifi.dbs.elki.math.linearalgebra.pca.PCARunner create mode 100644 src/META-INF/elki/de.lmu.ifi.dbs.elki.math.statistics.tests.GoodnessOfFitTest create mode 100644 src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/CorePredicate.java create mode 100644 src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/EpsilonNeighborPredicate.java 
create mode 100644 src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/GeneralizedDBSCAN.java create mode 100644 src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/MinPtsCorePredicate.java create mode 100644 src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/NeighborPredicate.java create mode 100644 src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/package-info.java create mode 100644 src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeans.java create mode 100644 src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMediansLloyd.java create mode 100644 src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsEM.java create mode 100644 src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsInitialization.java create mode 100644 src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsPAM.java create mode 100644 src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/PAMInitialMeans.java create mode 100644 src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/SubspaceClusteringAlgorithm.java create mode 100644 src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/ByLabelOrAllInOneClustering.java create mode 100644 src/de/lmu/ifi/dbs/elki/algorithm/outlier/ALOCI.java create mode 100644 src/de/lmu/ifi/dbs/elki/algorithm/outlier/HilOut.java delete mode 100644 src/de/lmu/ifi/dbs/elki/algorithm/outlier/OUTRES.java delete mode 100644 src/de/lmu/ifi/dbs/elki/algorithm/outlier/SOD.java create mode 100644 src/de/lmu/ifi/dbs/elki/algorithm/outlier/meta/HiCS.java create mode 100644 src/de/lmu/ifi/dbs/elki/algorithm/outlier/subspace/OUTRES.java create mode 100644 src/de/lmu/ifi/dbs/elki/algorithm/outlier/subspace/OutRankS1.java create mode 100644 src/de/lmu/ifi/dbs/elki/algorithm/outlier/subspace/SOD.java create mode 100644 src/de/lmu/ifi/dbs/elki/algorithm/outlier/subspace/package-info.java create mode 100644 src/de/lmu/ifi/dbs/elki/algorithm/statistics/AddSingleScale.java create mode 100644 src/de/lmu/ifi/dbs/elki/data/SparseDoubleVector.java create mode 100644 
src/de/lmu/ifi/dbs/elki/data/model/MedoidModel.java delete mode 100644 src/de/lmu/ifi/dbs/elki/data/projection/AbstractFeatureSelection.java create mode 100644 src/de/lmu/ifi/dbs/elki/database/datastore/IntegerDataStore.java create mode 100644 src/de/lmu/ifi/dbs/elki/database/datastore/WritableIntegerDataStore.java create mode 100644 src/de/lmu/ifi/dbs/elki/database/datastore/memory/ArrayIntegerStore.java create mode 100644 src/de/lmu/ifi/dbs/elki/database/datastore/memory/MapIntegerDBIDIntegerStore.java create mode 100644 src/de/lmu/ifi/dbs/elki/database/ids/DBIDMIter.java create mode 100644 src/de/lmu/ifi/dbs/elki/database/ids/DBIDRef.java create mode 100644 src/de/lmu/ifi/dbs/elki/database/ids/generic/UnmodifiableArrayDBIDs.java create mode 100644 src/de/lmu/ifi/dbs/elki/database/query/DistanceDBIDResult.java create mode 100644 src/de/lmu/ifi/dbs/elki/database/query/GenericDistanceDBIDList.java create mode 100644 src/de/lmu/ifi/dbs/elki/datasource/filter/FilterUtil.java create mode 100644 src/de/lmu/ifi/dbs/elki/datasource/filter/RandomSamplingStreamFilter.java delete mode 100644 src/de/lmu/ifi/dbs/elki/datasource/filter/SparseFloatVectorProjectionFilter.java delete mode 100644 src/de/lmu/ifi/dbs/elki/datasource/filter/SparseFloatVectorRandomProjectionFilter.java create mode 100644 src/de/lmu/ifi/dbs/elki/datasource/filter/SparseNumberVectorProjectionFilter.java create mode 100644 src/de/lmu/ifi/dbs/elki/datasource/filter/SparseNumberVectorRandomProjectionFilter.java create mode 100644 src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/AbstractStreamNormalization.java create mode 100644 src/de/lmu/ifi/dbs/elki/datasource/filter/transform/GlobalPrincipalComponentAnalysisTransform.java create mode 100644 src/de/lmu/ifi/dbs/elki/datasource/filter/transform/package-info.java delete mode 100644 src/de/lmu/ifi/dbs/elki/datasource/parser/ParameterizationFunctionLabelParser.java create mode 100644 
src/de/lmu/ifi/dbs/elki/datasource/parser/SparseNumberVectorLabelParser.java create mode 100644 src/de/lmu/ifi/dbs/elki/distance/distancefunction/SparseEuclideanDistanceFunction.java create mode 100644 src/de/lmu/ifi/dbs/elki/distance/distancefunction/SparseLPNormDistanceFunction.java create mode 100644 src/de/lmu/ifi/dbs/elki/distance/distancefunction/SparseManhattanDistanceFunction.java create mode 100644 src/de/lmu/ifi/dbs/elki/distance/distancefunction/SparseMaximumDistanceFunction.java create mode 100644 src/de/lmu/ifi/dbs/elki/evaluation/AutomaticEvaluation.java create mode 100644 src/de/lmu/ifi/dbs/elki/evaluation/NoAutomaticEvaluation.java create mode 100644 src/de/lmu/ifi/dbs/elki/evaluation/outlier/OutlierPrecisionAtKCurve.java create mode 100644 src/de/lmu/ifi/dbs/elki/evaluation/outlier/OutlierPrecisionRecallCurve.java create mode 100644 src/de/lmu/ifi/dbs/elki/evaluation/outlier/OutlierROCCurve.java create mode 100644 src/de/lmu/ifi/dbs/elki/evaluation/outlier/OutlierSmROCCurve.java create mode 100644 src/de/lmu/ifi/dbs/elki/evaluation/outlier/OutlierThresholdClustering.java create mode 100644 src/de/lmu/ifi/dbs/elki/index/vafile/DAFile.java create mode 100644 src/de/lmu/ifi/dbs/elki/index/vafile/PartialVAFile.java create mode 100644 src/de/lmu/ifi/dbs/elki/logging/ErrorFormatter.java create mode 100644 src/de/lmu/ifi/dbs/elki/math/geometry/XYCurve.java create mode 100644 src/de/lmu/ifi/dbs/elki/math/linearalgebra/pca/DropEigenPairFilter.java delete mode 100644 src/de/lmu/ifi/dbs/elki/math/spacefillingcurves/ZCurve.java create mode 100644 src/de/lmu/ifi/dbs/elki/math/spacefillingcurves/ZCurveTransformer.java create mode 100644 src/de/lmu/ifi/dbs/elki/math/statistics/distribution/BetaDistribution.java create mode 100644 src/de/lmu/ifi/dbs/elki/math/statistics/distribution/DistributionWithRandom.java create mode 100644 src/de/lmu/ifi/dbs/elki/math/statistics/distribution/PoissonDistribution.java create mode 100644 
src/de/lmu/ifi/dbs/elki/math/statistics/distribution/StudentsTDistribution.java create mode 100644 src/de/lmu/ifi/dbs/elki/math/statistics/tests/GoodnessOfFitTest.java create mode 100644 src/de/lmu/ifi/dbs/elki/math/statistics/tests/KolmogorovSmirnovTest.java create mode 100644 src/de/lmu/ifi/dbs/elki/math/statistics/tests/WelchTTest.java create mode 100644 src/de/lmu/ifi/dbs/elki/math/statistics/tests/package-info.java create mode 100644 src/de/lmu/ifi/dbs/elki/utilities/datastructures/arraylike/SubsetNumberArrayAdapter.java create mode 100644 src/de/lmu/ifi/dbs/elki/utilities/iterator/Iter.java delete mode 100644 src/de/lmu/ifi/dbs/elki/utilities/iterator/IterableIterator.java delete mode 100644 src/de/lmu/ifi/dbs/elki/utilities/iterator/IterableIteratorAdapter.java delete mode 100644 src/de/lmu/ifi/dbs/elki/utilities/iterator/IterableUtil.java create mode 100644 src/de/lmu/ifi/dbs/elki/visualization/style/print.properties delete mode 100644 src/de/lmu/ifi/dbs/elki/visualization/visualizers/visunproj/CurveVisFactory.java create mode 100644 src/de/lmu/ifi/dbs/elki/visualization/visualizers/visunproj/XYCurveVisFactory.java create mode 100644 src/tutorial/outlier/DistanceStddevOutlier.java delete mode 100644 test/de/lmu/ifi/dbs/elki/algorithm/outlier/TestSOD.java create mode 100644 test/de/lmu/ifi/dbs/elki/algorithm/outlier/meta/TestHiCS.java create mode 100644 test/de/lmu/ifi/dbs/elki/algorithm/outlier/subspace/TestSOD.java create mode 100644 test/de/lmu/ifi/dbs/elki/math/statistics/distribution/AbstractDistributionTest.java create mode 100644 test/de/lmu/ifi/dbs/elki/math/statistics/distribution/TestBetaDistribution.java create mode 100644 test/de/lmu/ifi/dbs/elki/math/statistics/distribution/TestChiSquaredDistribution.java create mode 100644 test/de/lmu/ifi/dbs/elki/math/statistics/distribution/TestGammaDistribution.java create mode 100644 test/de/lmu/ifi/dbs/elki/math/statistics/distribution/TestNormalDistribution.java diff --git 
a/src/META-INF/elki/de.lmu.ifi.dbs.elki.algorithm.AbstractDistanceBasedAlgorithm b/src/META-INF/elki/de.lmu.ifi.dbs.elki.algorithm.AbstractDistanceBasedAlgorithm index dd8d4482..69a3aaed 100644 --- a/src/META-INF/elki/de.lmu.ifi.dbs.elki.algorithm.AbstractDistanceBasedAlgorithm +++ b/src/META-INF/elki/de.lmu.ifi.dbs.elki.algorithm.AbstractDistanceBasedAlgorithm @@ -5,11 +5,14 @@ de.lmu.ifi.dbs.elki.algorithm.clustering.DBSCAN de.lmu.ifi.dbs.elki.algorithm.clustering.OPTICS de.lmu.ifi.dbs.elki.algorithm.clustering.DeLiClu de.lmu.ifi.dbs.elki.algorithm.clustering.SLINK +de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.KMedoidsPAM +de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.KMedoidsEM de.lmu.ifi.dbs.elki.algorithm.clustering.correlation.HiCO de.lmu.ifi.dbs.elki.algorithm.clustering.subspace.HiSC de.lmu.ifi.dbs.elki.algorithm.outlier.ABOD de.lmu.ifi.dbs.elki.algorithm.outlier.DBOutlierDetection de.lmu.ifi.dbs.elki.algorithm.outlier.DBOutlierScore +de.lmu.ifi.dbs.elki.algorithm.outlier.HilOut de.lmu.ifi.dbs.elki.algorithm.outlier.INFLO de.lmu.ifi.dbs.elki.algorithm.outlier.KNNOutlier de.lmu.ifi.dbs.elki.algorithm.outlier.KNNWeightOutlier @@ -22,3 +25,4 @@ de.lmu.ifi.dbs.elki.algorithm.statistics.AveragePrecisionAtK de.lmu.ifi.dbs.elki.algorithm.statistics.RankingQualityHistogram de.lmu.ifi.dbs.elki.algorithm.statistics.DistanceStatisticsWithClasses de.lmu.ifi.dbs.elki.algorithm.statistics.EvaluateRankingQuality +tutorial.outlier.DistanceStddevOutlier \ No newline at end of file diff --git a/src/META-INF/elki/de.lmu.ifi.dbs.elki.algorithm.AbstractPrimitiveDistanceBasedAlgorithm b/src/META-INF/elki/de.lmu.ifi.dbs.elki.algorithm.AbstractPrimitiveDistanceBasedAlgorithm index 375277a4..7ee58e22 100644 --- a/src/META-INF/elki/de.lmu.ifi.dbs.elki.algorithm.AbstractPrimitiveDistanceBasedAlgorithm +++ b/src/META-INF/elki/de.lmu.ifi.dbs.elki.algorithm.AbstractPrimitiveDistanceBasedAlgorithm @@ -1,3 +1,4 @@ de.lmu.ifi.dbs.elki.algorithm.DependencyDerivator 
de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.KMeansLloyd de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.KMeansMacQueen +de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.KMediansLloyd \ No newline at end of file diff --git a/src/META-INF/elki/de.lmu.ifi.dbs.elki.algorithm.Algorithm b/src/META-INF/elki/de.lmu.ifi.dbs.elki.algorithm.Algorithm index 466a746a..37621d8b 100644 --- a/src/META-INF/elki/de.lmu.ifi.dbs.elki.algorithm.Algorithm +++ b/src/META-INF/elki/de.lmu.ifi.dbs.elki.algorithm.Algorithm @@ -2,8 +2,12 @@ de.lmu.ifi.dbs.elki.algorithm.NullAlgorithm de.lmu.ifi.dbs.elki.algorithm.clustering.DBSCAN de.lmu.ifi.dbs.elki.algorithm.clustering.DeLiClu de.lmu.ifi.dbs.elki.algorithm.clustering.EM +de.lmu.ifi.dbs.elki.algorithm.clustering.gdbscan.GeneralizedDBSCAN de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.KMeansLloyd de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.KMeansMacQueen +de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.KMediansLloyd +de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.KMedoidsPAM +de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.KMedoidsEM de.lmu.ifi.dbs.elki.algorithm.clustering.OPTICSXi de.lmu.ifi.dbs.elki.algorithm.clustering.OPTICS de.lmu.ifi.dbs.elki.algorithm.clustering.SLINK @@ -26,14 +30,17 @@ de.lmu.ifi.dbs.elki.algorithm.clustering.trivial.ByLabelHierarchicalClustering de.lmu.ifi.dbs.elki.algorithm.clustering.trivial.ByModelClustering de.lmu.ifi.dbs.elki.algorithm.clustering.trivial.TrivialAllInOne de.lmu.ifi.dbs.elki.algorithm.clustering.trivial.TrivialAllNoise +de.lmu.ifi.dbs.elki.algorithm.clustering.trivial.ByLabelOrAllInOneClustering de.lmu.ifi.dbs.elki.algorithm.outlier.ABOD de.lmu.ifi.dbs.elki.algorithm.outlier.AggarwalYuEvolutionary de.lmu.ifi.dbs.elki.algorithm.outlier.AggarwalYuNaive +de.lmu.ifi.dbs.elki.algorithm.outlier.ALOCI de.lmu.ifi.dbs.elki.algorithm.outlier.DBOutlierDetection de.lmu.ifi.dbs.elki.algorithm.outlier.DBOutlierScore de.lmu.ifi.dbs.elki.algorithm.outlier.EMOutlier 
de.lmu.ifi.dbs.elki.algorithm.outlier.GaussianModel de.lmu.ifi.dbs.elki.algorithm.outlier.GaussianUniformMixture +de.lmu.ifi.dbs.elki.algorithm.outlier.HilOut de.lmu.ifi.dbs.elki.algorithm.outlier.INFLO de.lmu.ifi.dbs.elki.algorithm.outlier.KNNOutlier de.lmu.ifi.dbs.elki.algorithm.outlier.KNNWeightOutlier @@ -42,10 +49,11 @@ de.lmu.ifi.dbs.elki.algorithm.outlier.LOCI de.lmu.ifi.dbs.elki.algorithm.outlier.LOF de.lmu.ifi.dbs.elki.algorithm.outlier.LoOP de.lmu.ifi.dbs.elki.algorithm.outlier.OPTICSOF -de.lmu.ifi.dbs.elki.algorithm.outlier.OUTRES de.lmu.ifi.dbs.elki.algorithm.outlier.ReferenceBasedOutlierDetection -de.lmu.ifi.dbs.elki.algorithm.outlier.SOD de.lmu.ifi.dbs.elki.algorithm.outlier.OnlineLOF +de.lmu.ifi.dbs.elki.algorithm.outlier.subspace.OUTRES +de.lmu.ifi.dbs.elki.algorithm.outlier.subspace.OutRankS1 +de.lmu.ifi.dbs.elki.algorithm.outlier.subspace.SOD de.lmu.ifi.dbs.elki.algorithm.outlier.spatial.CTLuGLSBackwardSearchAlgorithm de.lmu.ifi.dbs.elki.algorithm.outlier.spatial.CTLuMeanMultipleAttributes de.lmu.ifi.dbs.elki.algorithm.outlier.spatial.CTLuMedianAlgorithm @@ -59,11 +67,13 @@ de.lmu.ifi.dbs.elki.algorithm.outlier.spatial.SOF de.lmu.ifi.dbs.elki.algorithm.outlier.spatial.TrimmedMeanApproach de.lmu.ifi.dbs.elki.algorithm.outlier.meta.ExternalDoubleOutlierScore de.lmu.ifi.dbs.elki.algorithm.outlier.meta.FeatureBagging +de.lmu.ifi.dbs.elki.algorithm.outlier.meta.HiCS de.lmu.ifi.dbs.elki.algorithm.outlier.meta.RescaleMetaOutlierAlgorithm de.lmu.ifi.dbs.elki.algorithm.outlier.trivial.ByLabelOutlier de.lmu.ifi.dbs.elki.algorithm.outlier.trivial.TrivialAllOutlier de.lmu.ifi.dbs.elki.algorithm.outlier.trivial.TrivialNoOutlier de.lmu.ifi.dbs.elki.algorithm.outlier.trivial.TrivialGeneratedOutlier +de.lmu.ifi.dbs.elki.algorithm.statistics.AddSingleScale de.lmu.ifi.dbs.elki.algorithm.statistics.AveragePrecisionAtK de.lmu.ifi.dbs.elki.algorithm.statistics.EvaluateRankingQuality de.lmu.ifi.dbs.elki.algorithm.statistics.RankingQualityHistogram @@ -74,3 +84,4 @@ 
de.lmu.ifi.dbs.elki.algorithm.DependencyDerivator de.lmu.ifi.dbs.elki.algorithm.KNNDistanceOrder de.lmu.ifi.dbs.elki.algorithm.KNNJoin de.lmu.ifi.dbs.elki.algorithm.MaterializeDistances +tutorial.outlier.DistanceStddevOutlier \ No newline at end of file diff --git a/src/META-INF/elki/de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm b/src/META-INF/elki/de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm index f429b53e..2ae85aa5 100644 --- a/src/META-INF/elki/de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm +++ b/src/META-INF/elki/de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm @@ -1,7 +1,11 @@ de.lmu.ifi.dbs.elki.algorithm.clustering.DBSCAN de.lmu.ifi.dbs.elki.algorithm.clustering.EM +de.lmu.ifi.dbs.elki.algorithm.clustering.gdbscan.GeneralizedDBSCAN de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.KMeansLloyd de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.KMeansMacQueen +de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.KMediansLloyd +de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.KMedoidsPAM +de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.KMedoidsEM de.lmu.ifi.dbs.elki.algorithm.clustering.OPTICSXi de.lmu.ifi.dbs.elki.algorithm.clustering.SNNClustering de.lmu.ifi.dbs.elki.algorithm.clustering.correlation.CASH @@ -19,3 +23,4 @@ de.lmu.ifi.dbs.elki.algorithm.clustering.trivial.ByLabelHierarchicalClustering de.lmu.ifi.dbs.elki.algorithm.clustering.trivial.ByModelClustering de.lmu.ifi.dbs.elki.algorithm.clustering.trivial.TrivialAllInOne de.lmu.ifi.dbs.elki.algorithm.clustering.trivial.TrivialAllNoise +de.lmu.ifi.dbs.elki.algorithm.clustering.trivial.ByLabelOrAllInOneClustering \ No newline at end of file diff --git a/src/META-INF/elki/de.lmu.ifi.dbs.elki.algorithm.clustering.gdbscan.CorePredicate b/src/META-INF/elki/de.lmu.ifi.dbs.elki.algorithm.clustering.gdbscan.CorePredicate new file mode 100644 index 00000000..df33be1c --- /dev/null +++ b/src/META-INF/elki/de.lmu.ifi.dbs.elki.algorithm.clustering.gdbscan.CorePredicate 
@@ -0,0 +1 @@ +de.lmu.ifi.dbs.elki.algorithm.clustering.gdbscan.MinPtsCorePredicate \ No newline at end of file diff --git a/src/META-INF/elki/de.lmu.ifi.dbs.elki.algorithm.clustering.gdbscan.NeighborPredicate b/src/META-INF/elki/de.lmu.ifi.dbs.elki.algorithm.clustering.gdbscan.NeighborPredicate new file mode 100644 index 00000000..08ed8efa --- /dev/null +++ b/src/META-INF/elki/de.lmu.ifi.dbs.elki.algorithm.clustering.gdbscan.NeighborPredicate @@ -0,0 +1 @@ +de.lmu.ifi.dbs.elki.algorithm.clustering.gdbscan.EpsilonNeighborPredicate \ No newline at end of file diff --git a/src/META-INF/elki/de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.KMeansInitialization b/src/META-INF/elki/de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.KMeansInitialization index b5006bd4..5734e7bf 100644 --- a/src/META-INF/elki/de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.KMeansInitialization +++ b/src/META-INF/elki/de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.KMeansInitialization @@ -2,3 +2,4 @@ de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.RandomlyGeneratedInitialMeans de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.RandomlyChosenInitialMeans de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.FirstKInitialMeans de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.KMeansPlusPlusInitialMeans +de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.PAMInitialMeans \ No newline at end of file diff --git a/src/META-INF/elki/de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.KMedoidsInitialization b/src/META-INF/elki/de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.KMedoidsInitialization new file mode 100644 index 00000000..c75b1cf1 --- /dev/null +++ b/src/META-INF/elki/de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.KMedoidsInitialization @@ -0,0 +1,4 @@ +de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.PAMInitialMeans +de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.RandomlyChosenInitialMeans +de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.FirstKInitialMeans 
+de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.KMeansPlusPlusInitialMeans \ No newline at end of file diff --git a/src/META-INF/elki/de.lmu.ifi.dbs.elki.algorithm.clustering.subspace.SubspaceClusteringAlgorithm b/src/META-INF/elki/de.lmu.ifi.dbs.elki.algorithm.clustering.subspace.SubspaceClusteringAlgorithm new file mode 100644 index 00000000..c71dd241 --- /dev/null +++ b/src/META-INF/elki/de.lmu.ifi.dbs.elki.algorithm.clustering.subspace.SubspaceClusteringAlgorithm @@ -0,0 +1,4 @@ +de.lmu.ifi.dbs.elki.algorithm.clustering.subspace.CLIQUE +de.lmu.ifi.dbs.elki.algorithm.clustering.subspace.DiSH +de.lmu.ifi.dbs.elki.algorithm.clustering.subspace.PROCLUS +de.lmu.ifi.dbs.elki.algorithm.clustering.subspace.SUBCLU \ No newline at end of file diff --git a/src/META-INF/elki/de.lmu.ifi.dbs.elki.algorithm.outlier.OutlierAlgorithm b/src/META-INF/elki/de.lmu.ifi.dbs.elki.algorithm.outlier.OutlierAlgorithm index 14bb81fa..18ac09f8 100644 --- a/src/META-INF/elki/de.lmu.ifi.dbs.elki.algorithm.outlier.OutlierAlgorithm +++ b/src/META-INF/elki/de.lmu.ifi.dbs.elki.algorithm.outlier.OutlierAlgorithm @@ -1,11 +1,13 @@ de.lmu.ifi.dbs.elki.algorithm.outlier.ABOD de.lmu.ifi.dbs.elki.algorithm.outlier.AggarwalYuEvolutionary de.lmu.ifi.dbs.elki.algorithm.outlier.AggarwalYuNaive +de.lmu.ifi.dbs.elki.algorithm.outlier.ALOCI de.lmu.ifi.dbs.elki.algorithm.outlier.DBOutlierDetection de.lmu.ifi.dbs.elki.algorithm.outlier.DBOutlierScore de.lmu.ifi.dbs.elki.algorithm.outlier.EMOutlier de.lmu.ifi.dbs.elki.algorithm.outlier.GaussianModel de.lmu.ifi.dbs.elki.algorithm.outlier.GaussianUniformMixture +de.lmu.ifi.dbs.elki.algorithm.outlier.HilOut de.lmu.ifi.dbs.elki.algorithm.outlier.INFLO de.lmu.ifi.dbs.elki.algorithm.outlier.KNNOutlier de.lmu.ifi.dbs.elki.algorithm.outlier.KNNWeightOutlier @@ -14,10 +16,11 @@ de.lmu.ifi.dbs.elki.algorithm.outlier.LOCI de.lmu.ifi.dbs.elki.algorithm.outlier.LOF de.lmu.ifi.dbs.elki.algorithm.outlier.LoOP de.lmu.ifi.dbs.elki.algorithm.outlier.OPTICSOF 
-de.lmu.ifi.dbs.elki.algorithm.outlier.OUTRES de.lmu.ifi.dbs.elki.algorithm.outlier.ReferenceBasedOutlierDetection -de.lmu.ifi.dbs.elki.algorithm.outlier.SOD de.lmu.ifi.dbs.elki.algorithm.outlier.OnlineLOF +de.lmu.ifi.dbs.elki.algorithm.outlier.subspace.OUTRES +de.lmu.ifi.dbs.elki.algorithm.outlier.subspace.OutRankS1 +de.lmu.ifi.dbs.elki.algorithm.outlier.subspace.SOD de.lmu.ifi.dbs.elki.algorithm.outlier.spatial.CTLuGLSBackwardSearchAlgorithm de.lmu.ifi.dbs.elki.algorithm.outlier.spatial.CTLuMeanMultipleAttributes de.lmu.ifi.dbs.elki.algorithm.outlier.spatial.CTLuMedianAlgorithm @@ -31,8 +34,10 @@ de.lmu.ifi.dbs.elki.algorithm.outlier.spatial.SOF de.lmu.ifi.dbs.elki.algorithm.outlier.spatial.TrimmedMeanApproach de.lmu.ifi.dbs.elki.algorithm.outlier.meta.ExternalDoubleOutlierScore de.lmu.ifi.dbs.elki.algorithm.outlier.meta.FeatureBagging +de.lmu.ifi.dbs.elki.algorithm.outlier.meta.HiCS de.lmu.ifi.dbs.elki.algorithm.outlier.meta.RescaleMetaOutlierAlgorithm de.lmu.ifi.dbs.elki.algorithm.outlier.trivial.ByLabelOutlier de.lmu.ifi.dbs.elki.algorithm.outlier.trivial.TrivialAllOutlier de.lmu.ifi.dbs.elki.algorithm.outlier.trivial.TrivialNoOutlier de.lmu.ifi.dbs.elki.algorithm.outlier.trivial.TrivialGeneratedOutlier +tutorial.outlier.DistanceStddevOutlier \ No newline at end of file diff --git a/src/META-INF/elki/de.lmu.ifi.dbs.elki.data.NumberVector b/src/META-INF/elki/de.lmu.ifi.dbs.elki.data.NumberVector index 9ee14dab..fd4cab91 100644 --- a/src/META-INF/elki/de.lmu.ifi.dbs.elki.data.NumberVector +++ b/src/META-INF/elki/de.lmu.ifi.dbs.elki.data.NumberVector @@ -5,6 +5,7 @@ de.lmu.ifi.dbs.elki.data.IntegerVector de.lmu.ifi.dbs.elki.data.OneDimensionalDoubleVector de.lmu.ifi.dbs.elki.data.ParameterizationFunction de.lmu.ifi.dbs.elki.data.SparseFloatVector +de.lmu.ifi.dbs.elki.data.SparseDoubleVector # de.lmu.ifi.dbs.elki.math.linearalgebra.Vector # de.lmu.ifi.dbs.elki.math.linearalgebra.Centroid # de.lmu.ifi.dbs.elki.math.linearalgebra.ProjectedCentroid \ No newline at 
end of file diff --git a/src/META-INF/elki/de.lmu.ifi.dbs.elki.data.SparseNumberVector b/src/META-INF/elki/de.lmu.ifi.dbs.elki.data.SparseNumberVector new file mode 100644 index 00000000..ca27b7b8 --- /dev/null +++ b/src/META-INF/elki/de.lmu.ifi.dbs.elki.data.SparseNumberVector @@ -0,0 +1,2 @@ +de.lmu.ifi.dbs.elki.data.SparseFloatVector +de.lmu.ifi.dbs.elki.data.SparseDoubleVector \ No newline at end of file diff --git a/src/META-INF/elki/de.lmu.ifi.dbs.elki.datasource.filter.ObjectFilter b/src/META-INF/elki/de.lmu.ifi.dbs.elki.datasource.filter.ObjectFilter index 3c159ed2..fa40e217 100644 --- a/src/META-INF/elki/de.lmu.ifi.dbs.elki.datasource.filter.ObjectFilter +++ b/src/META-INF/elki/de.lmu.ifi.dbs.elki.datasource.filter.ObjectFilter @@ -6,10 +6,11 @@ de.lmu.ifi.dbs.elki.datasource.filter.ExternalIDFilter de.lmu.ifi.dbs.elki.datasource.filter.FixedDBIDsFilter de.lmu.ifi.dbs.elki.datasource.filter.NoOpFilter de.lmu.ifi.dbs.elki.datasource.filter.NoMissingValuesFilter +de.lmu.ifi.dbs.elki.datasource.filter.RandomSamplingStreamFilter de.lmu.ifi.dbs.elki.datasource.filter.ShuffleObjectsFilter de.lmu.ifi.dbs.elki.datasource.filter.SortByLabelFilter -de.lmu.ifi.dbs.elki.datasource.filter.SparseFloatVectorProjectionFilter -de.lmu.ifi.dbs.elki.datasource.filter.SparseFloatVectorRandomProjectionFilter +de.lmu.ifi.dbs.elki.datasource.filter.SparseNumberVectorProjectionFilter +de.lmu.ifi.dbs.elki.datasource.filter.SparseNumberVectorRandomProjectionFilter de.lmu.ifi.dbs.elki.datasource.filter.SparseVectorFieldFilter de.lmu.ifi.dbs.elki.datasource.filter.SplitNumberVectorFilter de.lmu.ifi.dbs.elki.datasource.filter.normalization.AttributeWiseErfNormalization @@ -19,3 +20,4 @@ de.lmu.ifi.dbs.elki.datasource.filter.normalization.LengthNormalization de.lmu.ifi.dbs.elki.datasource.filter.normalization.InverseDocumentFrequencyNormalization de.lmu.ifi.dbs.elki.datasource.filter.normalization.RankTieNormalization 
de.lmu.ifi.dbs.elki.datasource.filter.normalization.TFIDFNormalization +de.lmu.ifi.dbs.elki.datasource.filter.transform.GlobalPrincipalComponentAnalysisTransform \ No newline at end of file diff --git a/src/META-INF/elki/de.lmu.ifi.dbs.elki.datasource.filter.StreamFilter b/src/META-INF/elki/de.lmu.ifi.dbs.elki.datasource.filter.StreamFilter new file mode 100644 index 00000000..178916cd --- /dev/null +++ b/src/META-INF/elki/de.lmu.ifi.dbs.elki.datasource.filter.StreamFilter @@ -0,0 +1,10 @@ +de.lmu.ifi.dbs.elki.datasource.filter.ByLabelFilter +de.lmu.ifi.dbs.elki.datasource.filter.DoubleVectorProjectionFilter +de.lmu.ifi.dbs.elki.datasource.filter.DoubleVectorRandomProjectionFilter +de.lmu.ifi.dbs.elki.datasource.filter.FixedDBIDsFilter +de.lmu.ifi.dbs.elki.datasource.filter.NoMissingValuesFilter +de.lmu.ifi.dbs.elki.datasource.filter.NoOpFilter +de.lmu.ifi.dbs.elki.datasource.filter.RandomSamplingStreamFilter +de.lmu.ifi.dbs.elki.datasource.filter.SparseNumberVectorProjectionFilter +de.lmu.ifi.dbs.elki.datasource.filter.SparseNumberVectorRandomProjectionFilter +de.lmu.ifi.dbs.elki.datasource.filter.normalization.LengthNormalization \ No newline at end of file diff --git a/src/META-INF/elki/de.lmu.ifi.dbs.elki.datasource.parser.Parser b/src/META-INF/elki/de.lmu.ifi.dbs.elki.datasource.parser.Parser index 4002f056..3d005531 100644 --- a/src/META-INF/elki/de.lmu.ifi.dbs.elki.datasource.parser.Parser +++ b/src/META-INF/elki/de.lmu.ifi.dbs.elki.datasource.parser.Parser @@ -1,10 +1,10 @@ de.lmu.ifi.dbs.elki.datasource.parser.NumberVectorLabelParser de.lmu.ifi.dbs.elki.datasource.parser.ArffParser -de.lmu.ifi.dbs.elki.datasource.parser.DoubleVectorLabelParser -de.lmu.ifi.dbs.elki.datasource.parser.FloatVectorLabelParser +de.lmu.ifi.dbs.elki.datasource.parser.SparseNumberVectorLabelParser de.lmu.ifi.dbs.elki.datasource.parser.SparseBitVectorLabelParser -de.lmu.ifi.dbs.elki.datasource.parser.SparseFloatVectorLabelParser 
de.lmu.ifi.dbs.elki.datasource.parser.TermFrequencyParser de.lmu.ifi.dbs.elki.datasource.parser.BitVectorLabelParser -de.lmu.ifi.dbs.elki.datasource.parser.ParameterizationFunctionLabelParser de.lmu.ifi.dbs.elki.datasource.parser.SimplePolygonParser +# deprecated: de.lmu.ifi.dbs.elki.datasource.parser.DoubleVectorLabelParser +# deprecated: de.lmu.ifi.dbs.elki.datasource.parser.FloatVectorLabelParser +# deprecated: de.lmu.ifi.dbs.elki.datasource.parser.SparseFloatVectorLabelParser \ No newline at end of file diff --git a/src/META-INF/elki/de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction b/src/META-INF/elki/de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction index 45c31e56..249db041 100644 --- a/src/META-INF/elki/de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction +++ b/src/META-INF/elki/de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction @@ -16,6 +16,10 @@ de.lmu.ifi.dbs.elki.distance.distancefunction.LocallyWeightedDistanceFunction # de.lmu.ifi.dbs.elki.distance.distancefunction.ProxyDistanceFunction # de.lmu.ifi.dbs.elki.distance.distancefunction.WeightedLPNormDistanceFunction # de.lmu.ifi.dbs.elki.distance.distancefunction.WeightedSquaredEuclideanDistanceFunction +de.lmu.ifi.dbs.elki.distance.distancefunction.SparseEuclideanDistanceFunction +de.lmu.ifi.dbs.elki.distance.distancefunction.SparseManhattanDistanceFunction +de.lmu.ifi.dbs.elki.distance.distancefunction.SparseLPNormDistanceFunction +de.lmu.ifi.dbs.elki.distance.distancefunction.SparseMaximumDistanceFunction de.lmu.ifi.dbs.elki.distance.distancefunction.adapter.SimilarityAdapterArccos de.lmu.ifi.dbs.elki.distance.distancefunction.adapter.SimilarityAdapterLn de.lmu.ifi.dbs.elki.distance.distancefunction.adapter.SimilarityAdapterLinear diff --git a/src/META-INF/elki/de.lmu.ifi.dbs.elki.distance.distancefunction.DoubleNorm b/src/META-INF/elki/de.lmu.ifi.dbs.elki.distance.distancefunction.DoubleNorm index bb3847a9..2714a376 100644 --- 
a/src/META-INF/elki/de.lmu.ifi.dbs.elki.distance.distancefunction.DoubleNorm +++ b/src/META-INF/elki/de.lmu.ifi.dbs.elki.distance.distancefunction.DoubleNorm @@ -4,7 +4,11 @@ de.lmu.ifi.dbs.elki.distance.distancefunction.MaximumDistanceFunction de.lmu.ifi.dbs.elki.distance.distancefunction.MinimumDistanceFunction de.lmu.ifi.dbs.elki.distance.distancefunction.LPNormDistanceFunction de.lmu.ifi.dbs.elki.distance.distancefunction.SquaredEuclideanDistanceFunction -de.lmu.ifi.dbs.elki.distance.distancefunction.WeightedLPNormDistanceFunction +# de.lmu.ifi.dbs.elki.distance.distancefunction.WeightedLPNormDistanceFunction +de.lmu.ifi.dbs.elki.distance.distancefunction.SparseEuclideanDistanceFunction +de.lmu.ifi.dbs.elki.distance.distancefunction.SparseManhattanDistanceFunction +de.lmu.ifi.dbs.elki.distance.distancefunction.SparseLPNormDistanceFunction +de.lmu.ifi.dbs.elki.distance.distancefunction.SparseMaximumDistanceFunction de.lmu.ifi.dbs.elki.distance.distancefunction.subspace.SubspaceEuclideanDistanceFunction de.lmu.ifi.dbs.elki.distance.distancefunction.subspace.SubspaceLPNormDistanceFunction de.lmu.ifi.dbs.elki.distance.distancefunction.subspace.SubspaceManhattanDistanceFunction diff --git a/src/META-INF/elki/de.lmu.ifi.dbs.elki.distance.distancefunction.LPNormDistanceFunction b/src/META-INF/elki/de.lmu.ifi.dbs.elki.distance.distancefunction.LPNormDistanceFunction new file mode 100644 index 00000000..4cac929a --- /dev/null +++ b/src/META-INF/elki/de.lmu.ifi.dbs.elki.distance.distancefunction.LPNormDistanceFunction @@ -0,0 +1,5 @@ +de.lmu.ifi.dbs.elki.distance.distancefunction.EuclideanDistanceFunction +de.lmu.ifi.dbs.elki.distance.distancefunction.ManhattanDistanceFunction +de.lmu.ifi.dbs.elki.distance.distancefunction.LPNormDistanceFunction +de.lmu.ifi.dbs.elki.distance.distancefunction.MaximumDistanceFunction +# de.lmu.ifi.dbs.elki.distance.distancefunction.WeightedLPNormDistanceFunction \ No newline at end of file diff --git 
a/src/META-INF/elki/de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction b/src/META-INF/elki/de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction index bc770e74..9dc29f22 100644 --- a/src/META-INF/elki/de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction +++ b/src/META-INF/elki/de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction @@ -11,6 +11,10 @@ de.lmu.ifi.dbs.elki.distance.distancefunction.JeffreyDivergenceDistanceFunction # de.lmu.ifi.dbs.elki.distance.distancefunction.WeightedDistanceFunction # de.lmu.ifi.dbs.elki.distance.distancefunction.WeightedLPNormDistanceFunction # de.lmu.ifi.dbs.elki.distance.distancefunction.WeightedSquaredEuclideanDistanceFunction +de.lmu.ifi.dbs.elki.distance.distancefunction.SparseEuclideanDistanceFunction +de.lmu.ifi.dbs.elki.distance.distancefunction.SparseManhattanDistanceFunction +de.lmu.ifi.dbs.elki.distance.distancefunction.SparseLPNormDistanceFunction +de.lmu.ifi.dbs.elki.distance.distancefunction.SparseMaximumDistanceFunction de.lmu.ifi.dbs.elki.distance.distancefunction.colorhistogram.HSBHistogramQuadraticDistanceFunction de.lmu.ifi.dbs.elki.distance.distancefunction.colorhistogram.HistogramIntersectionDistanceFunction de.lmu.ifi.dbs.elki.distance.distancefunction.colorhistogram.RGBHistogramQuadraticDistanceFunction diff --git a/src/META-INF/elki/de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDoubleDistanceFunction b/src/META-INF/elki/de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDoubleDistanceFunction index 3bf5428e..35baef47 100644 --- a/src/META-INF/elki/de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDoubleDistanceFunction +++ b/src/META-INF/elki/de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDoubleDistanceFunction @@ -11,6 +11,10 @@ de.lmu.ifi.dbs.elki.distance.distancefunction.JeffreyDivergenceDistanceFunction # de.lmu.ifi.dbs.elki.distance.distancefunction.WeightedDistanceFunction # 
de.lmu.ifi.dbs.elki.distance.distancefunction.WeightedLPNormDistanceFunction # de.lmu.ifi.dbs.elki.distance.distancefunction.WeightedSquaredEuclideanDistanceFunction +de.lmu.ifi.dbs.elki.distance.distancefunction.SparseEuclideanDistanceFunction +de.lmu.ifi.dbs.elki.distance.distancefunction.SparseManhattanDistanceFunction +de.lmu.ifi.dbs.elki.distance.distancefunction.SparseLPNormDistanceFunction +de.lmu.ifi.dbs.elki.distance.distancefunction.SparseMaximumDistanceFunction de.lmu.ifi.dbs.elki.distance.distancefunction.colorhistogram.HSBHistogramQuadraticDistanceFunction de.lmu.ifi.dbs.elki.distance.distancefunction.colorhistogram.HistogramIntersectionDistanceFunction de.lmu.ifi.dbs.elki.distance.distancefunction.colorhistogram.RGBHistogramQuadraticDistanceFunction diff --git a/src/META-INF/elki/de.lmu.ifi.dbs.elki.distance.distancefunction.SpatialPrimitiveDistanceFunction b/src/META-INF/elki/de.lmu.ifi.dbs.elki.distance.distancefunction.SpatialPrimitiveDistanceFunction index bc1c6777..01d9789a 100644 --- a/src/META-INF/elki/de.lmu.ifi.dbs.elki.distance.distancefunction.SpatialPrimitiveDistanceFunction +++ b/src/META-INF/elki/de.lmu.ifi.dbs.elki.distance.distancefunction.SpatialPrimitiveDistanceFunction @@ -7,9 +7,9 @@ de.lmu.ifi.dbs.elki.distance.distancefunction.ArcCosineDistanceFunction de.lmu.ifi.dbs.elki.distance.distancefunction.CosineDistanceFunction de.lmu.ifi.dbs.elki.distance.distancefunction.CanberraDistanceFunction de.lmu.ifi.dbs.elki.distance.distancefunction.SquaredEuclideanDistanceFunction -de.lmu.ifi.dbs.elki.distance.distancefunction.WeightedLPNormDistanceFunction +# de.lmu.ifi.dbs.elki.distance.distancefunction.WeightedLPNormDistanceFunction de.lmu.ifi.dbs.elki.distance.distancefunction.colorhistogram.HistogramIntersectionDistanceFunction de.lmu.ifi.dbs.elki.distance.distancefunction.subspace.DimensionSelectingDistanceFunction de.lmu.ifi.dbs.elki.distance.distancefunction.subspace.SubspaceEuclideanDistanceFunction 
de.lmu.ifi.dbs.elki.distance.distancefunction.subspace.SubspaceLPNormDistanceFunction -de.lmu.ifi.dbs.elki.distance.distancefunction.subspace.SubspaceManhattanDistanceFunction \ No newline at end of file +de.lmu.ifi.dbs.elki.distance.distancefunction.subspace.SubspaceManhattanDistanceFunction diff --git a/src/META-INF/elki/de.lmu.ifi.dbs.elki.evaluation.Evaluator b/src/META-INF/elki/de.lmu.ifi.dbs.elki.evaluation.Evaluator index f2813425..1bacadda 100644 --- a/src/META-INF/elki/de.lmu.ifi.dbs.elki.evaluation.Evaluator +++ b/src/META-INF/elki/de.lmu.ifi.dbs.elki.evaluation.Evaluator @@ -1,8 +1,15 @@ +de.lmu.ifi.dbs.elki.evaluation.AutomaticEvaluation +de.lmu.ifi.dbs.elki.evaluation.NoAutomaticEvaluation de.lmu.ifi.dbs.elki.evaluation.clustering.EvaluateClustering de.lmu.ifi.dbs.elki.evaluation.clustering.pairsegments.ClusterPairSegmentAnalysis -de.lmu.ifi.dbs.elki.evaluation.roc.ComputeROCCurve de.lmu.ifi.dbs.elki.evaluation.histogram.ComputeOutlierHistogram de.lmu.ifi.dbs.elki.evaluation.index.IndexPurity de.lmu.ifi.dbs.elki.evaluation.index.IndexStatistics +de.lmu.ifi.dbs.elki.evaluation.outlier.OutlierROCCurve +de.lmu.ifi.dbs.elki.evaluation.outlier.OutlierThresholdClustering +de.lmu.ifi.dbs.elki.evaluation.outlier.OutlierPrecisionAtKCurve +de.lmu.ifi.dbs.elki.evaluation.outlier.OutlierPrecisionRecallCurve +de.lmu.ifi.dbs.elki.evaluation.outlier.OutlierSmROCCurve de.lmu.ifi.dbs.elki.evaluation.outlier.JudgeOutlierScores de.lmu.ifi.dbs.elki.evaluation.similaritymatrix.ComputeSimilarityMatrixImage +# de.lmu.ifi.dbs.elki.evaluation.roc.ComputeROCCurve \ No newline at end of file diff --git a/src/META-INF/elki/de.lmu.ifi.dbs.elki.index.IndexFactory b/src/META-INF/elki/de.lmu.ifi.dbs.elki.index.IndexFactory index 6c0fff30..50052cb3 100644 --- a/src/META-INF/elki/de.lmu.ifi.dbs.elki.index.IndexFactory +++ b/src/META-INF/elki/de.lmu.ifi.dbs.elki.index.IndexFactory @@ -8,6 +8,7 @@ de.lmu.ifi.dbs.elki.index.tree.metrical.mtreevariants.mktrees.mkapp.MkAppTreeFac 
de.lmu.ifi.dbs.elki.index.tree.metrical.mtreevariants.mktrees.mkmax.MkMaxTreeFactory de.lmu.ifi.dbs.elki.index.tree.metrical.mtreevariants.mktrees.mktab.MkTabTreeFactory de.lmu.ifi.dbs.elki.index.vafile.VAFile$Factory +de.lmu.ifi.dbs.elki.index.vafile.PartialVAFile$Factory de.lmu.ifi.dbs.elki.index.preprocessed.knn.MaterializeKNNAndRKNNPreprocessor$Factory de.lmu.ifi.dbs.elki.index.preprocessed.knn.MaterializeKNNPreprocessor$Factory de.lmu.ifi.dbs.elki.index.preprocessed.knn.KNNJoinMaterializeKNNPreprocessor$Factory diff --git a/src/META-INF/elki/de.lmu.ifi.dbs.elki.index.KNNIndex b/src/META-INF/elki/de.lmu.ifi.dbs.elki.index.KNNIndex index cc1e2e56..c3a3fc1b 100644 --- a/src/META-INF/elki/de.lmu.ifi.dbs.elki.index.KNNIndex +++ b/src/META-INF/elki/de.lmu.ifi.dbs.elki.index.KNNIndex @@ -14,4 +14,5 @@ de.lmu.ifi.dbs.elki.index.preprocessed.knn.MetricalIndexApproximationMaterialize de.lmu.ifi.dbs.elki.index.preprocessed.knn.PartitionApproximationMaterializeKNNPreprocessor de.lmu.ifi.dbs.elki.index.preprocessed.knn.RandomSampleKNNPreprocessor de.lmu.ifi.dbs.elki.index.preprocessed.knn.SpatialApproximationMaterializeKNNPreprocessor -de.lmu.ifi.dbs.elki.index.vafile.VAFile \ No newline at end of file +de.lmu.ifi.dbs.elki.index.vafile.VAFile +de.lmu.ifi.dbs.elki.index.vafile.PartialVAFile \ No newline at end of file diff --git a/src/META-INF/elki/de.lmu.ifi.dbs.elki.index.RangeIndex b/src/META-INF/elki/de.lmu.ifi.dbs.elki.index.RangeIndex index d39d1bbb..ce559a88 100644 --- a/src/META-INF/elki/de.lmu.ifi.dbs.elki.index.RangeIndex +++ b/src/META-INF/elki/de.lmu.ifi.dbs.elki.index.RangeIndex @@ -7,4 +7,5 @@ de.lmu.ifi.dbs.elki.index.tree.metrical.mtreevariants.mktrees.mkapp.MkAppTreeInd # de.lmu.ifi.dbs.elki.index.tree.metrical.mtreevariants.mktrees.mkcop.MkCoPTreeIndex de.lmu.ifi.dbs.elki.index.tree.metrical.mtreevariants.mktrees.mkmax.MkMaxTreeIndex de.lmu.ifi.dbs.elki.index.tree.metrical.mtreevariants.mktrees.mktab.MkTabTreeIndex -de.lmu.ifi.dbs.elki.index.vafile.VAFile 
\ No newline at end of file +de.lmu.ifi.dbs.elki.index.vafile.VAFile +de.lmu.ifi.dbs.elki.index.vafile.PartialVAFile \ No newline at end of file diff --git a/src/META-INF/elki/de.lmu.ifi.dbs.elki.math.linearalgebra.pca.EigenPairFilter b/src/META-INF/elki/de.lmu.ifi.dbs.elki.math.linearalgebra.pca.EigenPairFilter index 67050b39..b1e7c8d5 100644 --- a/src/META-INF/elki/de.lmu.ifi.dbs.elki.math.linearalgebra.pca.EigenPairFilter +++ b/src/META-INF/elki/de.lmu.ifi.dbs.elki.math.linearalgebra.pca.EigenPairFilter @@ -7,3 +7,4 @@ de.lmu.ifi.dbs.elki.math.linearalgebra.pca.WeakEigenPairFilter de.lmu.ifi.dbs.elki.math.linearalgebra.pca.RelativeEigenPairFilter de.lmu.ifi.dbs.elki.math.linearalgebra.pca.SignificantEigenPairFilter de.lmu.ifi.dbs.elki.math.linearalgebra.pca.CompositeEigenPairFilter +de.lmu.ifi.dbs.elki.math.linearalgebra.pca.DropEigenPairFilter \ No newline at end of file diff --git a/src/META-INF/elki/de.lmu.ifi.dbs.elki.math.linearalgebra.pca.PCARunner b/src/META-INF/elki/de.lmu.ifi.dbs.elki.math.linearalgebra.pca.PCARunner new file mode 100644 index 00000000..1f53a991 --- /dev/null +++ b/src/META-INF/elki/de.lmu.ifi.dbs.elki.math.linearalgebra.pca.PCARunner @@ -0,0 +1,3 @@ +de.lmu.ifi.dbs.elki.math.linearalgebra.pca.PCARunner +# de.lmu.ifi.dbs.elki.math.linearalgebra.pca.PCAFilteredRunner +de.lmu.ifi.dbs.elki.math.linearalgebra.pca.PCAFilteredAutotuningRunner \ No newline at end of file diff --git a/src/META-INF/elki/de.lmu.ifi.dbs.elki.math.statistics.tests.GoodnessOfFitTest b/src/META-INF/elki/de.lmu.ifi.dbs.elki.math.statistics.tests.GoodnessOfFitTest new file mode 100644 index 00000000..01391cbd --- /dev/null +++ b/src/META-INF/elki/de.lmu.ifi.dbs.elki.math.statistics.tests.GoodnessOfFitTest @@ -0,0 +1,2 @@ +de.lmu.ifi.dbs.elki.math.statistics.tests.KolmogorovSmirnovTest +de.lmu.ifi.dbs.elki.math.statistics.tests.WelchTTest \ No newline at end of file diff --git a/src/META-INF/elki/de.lmu.ifi.dbs.elki.visualization.visualizers.VisFactory 
b/src/META-INF/elki/de.lmu.ifi.dbs.elki.visualization.visualizers.VisFactory index 296024aa..06c0cc41 100644 --- a/src/META-INF/elki/de.lmu.ifi.dbs.elki.visualization.visualizers.VisFactory +++ b/src/META-INF/elki/de.lmu.ifi.dbs.elki.visualization.visualizers.VisFactory @@ -33,7 +33,7 @@ de.lmu.ifi.dbs.elki.visualization.visualizers.parallel.selection.SelectionToolLi de.lmu.ifi.dbs.elki.visualization.visualizers.pairsegments.CircleSegmentsVisualizer$Factory de.lmu.ifi.dbs.elki.visualization.visualizers.visunproj.HistogramVisFactory de.lmu.ifi.dbs.elki.visualization.visualizers.visunproj.ClusterEvaluationVisFactory -de.lmu.ifi.dbs.elki.visualization.visualizers.visunproj.CurveVisFactory +de.lmu.ifi.dbs.elki.visualization.visualizers.visunproj.XYCurveVisFactory de.lmu.ifi.dbs.elki.visualization.visualizers.visunproj.LabelVisFactory de.lmu.ifi.dbs.elki.visualization.visualizers.visunproj.PixmapVisualizer$Factory de.lmu.ifi.dbs.elki.visualization.visualizers.visunproj.SimilarityMatrixVisualizer$Factory diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/APRIORI.java b/src/de/lmu/ifi/dbs/elki/algorithm/APRIORI.java index fc346cd9..65339257 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/APRIORI.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/APRIORI.java @@ -34,7 +34,7 @@ import de.lmu.ifi.dbs.elki.data.BitVector; import de.lmu.ifi.dbs.elki.data.type.TypeInformation; import de.lmu.ifi.dbs.elki.data.type.TypeUtil; import de.lmu.ifi.dbs.elki.database.Database; -import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.result.AprioriResult; @@ -127,7 +127,7 @@ public class APRIORI extends AbstractAlgorithm { * @param relation the Relation to process * @return the AprioriResult learned by this APRIORI */ - public AprioriResult run(Database database, Relation relation) throws IllegalStateException { + public AprioriResult 
run(Database database, Relation relation) { Map support = new Hashtable(); List solution = new ArrayList(); final int size = relation.size(); @@ -264,8 +264,8 @@ public class APRIORI extends AbstractAlgorithm { support.put(bitSet, 0); } } - for(DBID id : database.iterDBIDs()) { - BitVector bv = database.get(id); + for(DBIDIter iditer = database.iterDBIDs(); iditer.valid(); iditer.advance()) { + BitVector bv = database.get(iditer); for(BitSet bitSet : candidates) { if(bv.contains(bitSet)) { support.put(bitSet, support.get(bitSet) + 1); diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/Algorithm.java b/src/de/lmu/ifi/dbs/elki/algorithm/Algorithm.java index 7c6f0dc5..ae221ca7 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/Algorithm.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/Algorithm.java @@ -53,11 +53,8 @@ public interface Algorithm extends Parameterizable { * * @param database the database to run the algorithm on * @return the Result computed by this algorithm - * @throws IllegalStateException if the algorithm has not been initialized - * properly (e.g. the setParameters(String[]) method has been failed - * to be called). */ - Result run(Database database) throws IllegalStateException; + Result run(Database database); /** * Get the input type restriction used for negotiating the data query. 
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/DependencyDerivator.java b/src/de/lmu/ifi/dbs/elki/algorithm/DependencyDerivator.java index 0ecfb228..e0eabf5c 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/DependencyDerivator.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/DependencyDerivator.java @@ -149,7 +149,7 @@ public class DependencyDerivator, D extends Distanc * @return the CorrelationAnalysisSolution computed by this * DependencyDerivator */ - public CorrelationAnalysisSolution run(Database database, Relation relation) throws IllegalStateException { + public CorrelationAnalysisSolution run(Database database, Relation relation) { if(logger.isVerbose()) { logger.verbose("retrieving database objects..."); } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/DummyAlgorithm.java b/src/de/lmu/ifi/dbs/elki/algorithm/DummyAlgorithm.java index 168c69f1..64188502 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/DummyAlgorithm.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/DummyAlgorithm.java @@ -27,7 +27,7 @@ import de.lmu.ifi.dbs.elki.data.NumberVector; import de.lmu.ifi.dbs.elki.data.type.TypeInformation; import de.lmu.ifi.dbs.elki.data.type.TypeUtil; import de.lmu.ifi.dbs.elki.database.Database; -import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; import de.lmu.ifi.dbs.elki.database.query.knn.KNNQuery; import de.lmu.ifi.dbs.elki.database.relation.Relation; @@ -80,11 +80,11 @@ public class DummyAlgorithm> extends AbstractAlgori DistanceQuery distQuery = database.getDistanceQuery(relation, EuclideanDistanceFunction.STATIC); KNNQuery knnQuery = database.getKNNQuery(distQuery, 10); - for(DBID id : relation.iterDBIDs()) { + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { // Get the actual object from the database (but discard the result) - relation.get(id); + relation.get(iditer); // run a 10NN query for each point (but discard the result) 
- knnQuery.getKNNForDBID(id, 10); + knnQuery.getKNNForDBID(iditer, 10); } return null; } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/KNNDistanceOrder.java b/src/de/lmu/ifi/dbs/elki/algorithm/KNNDistanceOrder.java index ac1820f9..137ffadf 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/KNNDistanceOrder.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/KNNDistanceOrder.java @@ -31,7 +31,7 @@ import java.util.Random; import de.lmu.ifi.dbs.elki.data.type.TypeInformation; import de.lmu.ifi.dbs.elki.data.type.TypeUtil; import de.lmu.ifi.dbs.elki.database.Database; -import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; import de.lmu.ifi.dbs.elki.database.query.knn.KNNQuery; import de.lmu.ifi.dbs.elki.database.query.knn.KNNResult; @@ -115,9 +115,9 @@ public class KNNDistanceOrder> extends AbstractDistance final Random random = new Random(); List knnDistances = new ArrayList(relation.size()); - for(DBID id : relation.iterDBIDs()) { + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { if(random.nextDouble() < percentage) { - final KNNResult neighbors = knnQuery.getKNNForDBID(id, k); + final KNNResult neighbors = knnQuery.getKNNForDBID(iditer, k); knnDistances.add(neighbors.getKNNDistance()); } } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/KNNJoin.java b/src/de/lmu/ifi/dbs/elki/algorithm/KNNJoin.java index 3cbfe143..3eb789c7 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/KNNJoin.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/KNNJoin.java @@ -115,12 +115,12 @@ public class KNNJoin, D extends Distance, N exte /** * Joins in the given spatial database to each object its k-nearest neighbors. * - * @throws IllegalStateException if not suitable {@link SpatialIndexTree} was - * found or the specified distance function is not an instance of - * {@link SpatialPrimitiveDistanceFunction}. 
+ * @param database Database to process + * @param relation Relation to process + * @return result */ @SuppressWarnings("unchecked") - public WritableDataStore> run(Database database, Relation relation) throws IllegalStateException { + public WritableDataStore> run(Database database, Relation relation) { if(!(getDistanceFunction() instanceof SpatialPrimitiveDistanceFunction)) { throw new IllegalStateException("Distance Function must be an instance of " + SpatialPrimitiveDistanceFunction.class.getName()); } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/MaterializeDistances.java b/src/de/lmu/ifi/dbs/elki/algorithm/MaterializeDistances.java index 89d2d3e0..b09f7ac2 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/MaterializeDistances.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/MaterializeDistances.java @@ -30,6 +30,7 @@ import de.lmu.ifi.dbs.elki.data.type.TypeInformation; import de.lmu.ifi.dbs.elki.data.type.TypeUtil; import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction; @@ -77,14 +78,14 @@ public class MaterializeDistances> extends Abs Collection> r = new ArrayList>(size * (size + 1) / 2); - for(DBID id1 : relation.iterDBIDs()) { - for(DBID id2 : relation.iterDBIDs()) { + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + for(DBIDIter iditer2 = relation.iterDBIDs(); iditer2.valid(); iditer2.advance()) { // skip inverted pairs - if(id2.compareTo(id1) > 0) { + if(iditer2.compareDBID(iditer) > 0) { continue; } - double d = distFunc.distance(id1, id2).doubleValue(); - r.add(new CTriple(id1, id2, d)); + double d = distFunc.distance(iditer, iditer2).doubleValue(); + r.add(new CTriple(iditer.getDBID(), iditer2.getDBID(), d)); } } return new 
CollectionResult>("Distance Matrix", "distance-matrix", r); diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/NullAlgorithm.java b/src/de/lmu/ifi/dbs/elki/algorithm/NullAlgorithm.java index a879c6b2..490d79fb 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/NullAlgorithm.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/NullAlgorithm.java @@ -53,7 +53,7 @@ public class NullAlgorithm extends AbstractAlgorithm { } @Override - public Result run(Database database) throws IllegalStateException { + public Result run(Database database) { return null; } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/AbstractProjectedClustering.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/AbstractProjectedClustering.java index ea441655..670a3f0f 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/AbstractProjectedClustering.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/AbstractProjectedClustering.java @@ -27,7 +27,6 @@ import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm; import de.lmu.ifi.dbs.elki.algorithm.clustering.subspace.PROCLUS; import de.lmu.ifi.dbs.elki.data.Clustering; import de.lmu.ifi.dbs.elki.data.NumberVector; -import de.lmu.ifi.dbs.elki.data.model.Model; import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.QueryUtil; import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; @@ -49,7 +48,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; * @param the result we return * @param the type of FeatureVector handled by this Algorithm */ -public abstract class AbstractProjectedClustering, V extends NumberVector> extends AbstractAlgorithm implements ClusteringAlgorithm { +public abstract class AbstractProjectedClustering, V extends NumberVector> extends AbstractAlgorithm implements ClusteringAlgorithm { /** * Parameter to specify the number of clusters to find, must be an integer * greater than 0. 
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/AbstractProjectedDBSCAN.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/AbstractProjectedDBSCAN.java index 108ba0ed..250cc70b 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/AbstractProjectedDBSCAN.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/AbstractProjectedDBSCAN.java @@ -37,6 +37,7 @@ import de.lmu.ifi.dbs.elki.data.type.TypeInformation; import de.lmu.ifi.dbs.elki.data.type.TypeUtil; import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.query.DistanceResultPair; @@ -166,7 +167,14 @@ public abstract class AbstractProjectedDBSCAN, V ext this.lambda = lambda; } - public Clustering run(Database database, Relation relation) throws IllegalStateException { + /** + * Run the algorithm + * + * @param database Database to process + * @param relation Relation to process + * @return Clustering result + */ + public Clustering run(Database database, Relation relation) { FiniteProgress objprog = getLogger().isVerbose() ? new FiniteProgress("Processing objects", relation.size(), getLogger()) : null; IndefiniteProgress clusprog = getLogger().isVerbose() ? 
new IndefiniteProgress("Number of clusters", getLogger()) : null; resultList = new ArrayList(); @@ -177,9 +185,9 @@ public abstract class AbstractProjectedDBSCAN, V ext RangeQuery rangeQuery = database.getRangeQuery(distFunc); if(relation.size() >= minpts) { - for(DBID id : relation.iterDBIDs()) { - if(!processedIDs.contains(id)) { - expandCluster(distFunc, rangeQuery, id, objprog, clusprog); + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + if(!processedIDs.contains(iditer)) { + expandCluster(distFunc, rangeQuery, iditer.getDBID(), objprog, clusprog); if(processedIDs.size() == relation.size() && noise.size() == 0) { break; } @@ -191,8 +199,8 @@ public abstract class AbstractProjectedDBSCAN, V ext } } else { - for(DBID id : relation.iterDBIDs()) { - noise.add(id); + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + noise.add(iditer); if(objprog != null && clusprog != null) { objprog.setProcessed(processedIDs.size(), getLogger()); clusprog.setProcessed(resultList.size(), getLogger()); @@ -284,28 +292,26 @@ public abstract class AbstractProjectedDBSCAN, V ext // try to expand the cluster ModifiableDBIDs currentCluster = DBIDUtil.newArray(); for(DistanceResultPair seed : seeds) { - DBID nextID = seed.getDBID(); - - Integer nextID_corrDim = distFunc.getIndex().getLocalProjection(nextID).getCorrelationDimension(); + int nextID_corrDim = distFunc.getIndex().getLocalProjection(seed).getCorrelationDimension(); // nextID is not reachable from start object if(nextID_corrDim > lambda) { continue; } - if(!processedIDs.contains(nextID)) { - currentCluster.add(nextID); - processedIDs.add(nextID); + if(!processedIDs.contains(seed)) { + currentCluster.add(seed); + processedIDs.add(seed); } - else if(noise.contains(nextID)) { - currentCluster.add(nextID); - noise.remove(nextID); + else if(noise.contains(seed)) { + currentCluster.add(seed); + noise.remove(seed); } } seeds.remove(0); while(seeds.size() > 0) { - DBID q 
= seeds.remove(0).getDBID(); - Integer corrDim_q = distFunc.getIndex().getLocalProjection(q).getCorrelationDimension(); + DistanceResultPair q = seeds.remove(0); + int corrDim_q = distFunc.getIndex().getLocalProjection(q).getCorrelationDimension(); // q forms no lambda-dim hyperplane if(corrDim_q > lambda) { continue; @@ -314,22 +320,22 @@ public abstract class AbstractProjectedDBSCAN, V ext List> reachables = rangeQuery.getRangeForDBID(q, epsilon); if(reachables.size() > minpts) { for(DistanceResultPair r : reachables) { - Integer corrDim_r = distFunc.getIndex().getLocalProjection(r.getDBID()).getCorrelationDimension(); + int corrDim_r = distFunc.getIndex().getLocalProjection(r).getCorrelationDimension(); // r is not reachable from q if(corrDim_r > lambda) { continue; } - boolean inNoise = noise.contains(r.getDBID()); - boolean unclassified = !processedIDs.contains(r.getDBID()); + boolean inNoise = noise.contains(r); + boolean unclassified = !processedIDs.contains(r); if(inNoise || unclassified) { if(unclassified) { seeds.add(r); } - currentCluster.add(r.getDBID()); - processedIDs.add(r.getDBID()); + currentCluster.add(r); + processedIDs.add(r); if(inNoise) { - noise.remove(r.getDBID()); + noise.remove(r); } if(objprog != null && clusprog != null) { objprog.setProcessed(processedIDs.size(), getLogger()); @@ -349,9 +355,7 @@ public abstract class AbstractProjectedDBSCAN, V ext resultList.add(currentCluster); } else { - for(DBID id : currentCluster) { - noise.add(id); - } + noise.addDBIDs(currentCluster); noise.add(startObjectID); processedIDs.add(startObjectID); } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/ClusteringAlgorithm.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/ClusteringAlgorithm.java index 5ec59777..8f637460 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/ClusteringAlgorithm.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/ClusteringAlgorithm.java @@ -47,5 +47,5 @@ import de.lmu.ifi.dbs.elki.database.Database; */ 
public interface ClusteringAlgorithm> extends Algorithm { @Override - C run(Database database) throws IllegalStateException; + C run(Database database); } \ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/DBSCAN.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/DBSCAN.java index b59af555..6bafa9e9 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/DBSCAN.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/DBSCAN.java @@ -35,6 +35,7 @@ import de.lmu.ifi.dbs.elki.data.type.TypeInformation; import de.lmu.ifi.dbs.elki.data.type.TypeUtil; import de.lmu.ifi.dbs.elki.database.QueryUtil; import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.query.DistanceResultPair; @@ -141,9 +142,9 @@ public class DBSCAN> extends AbstractDistanceBasedAlgor noise = DBIDUtil.newHashSet(); processedIDs = DBIDUtil.newHashSet(size); if(size >= minpts) { - for(DBID id : relation.iterDBIDs()) { - if(!processedIDs.contains(id)) { - expandCluster(relation, rangeQuery, id, objprog, clusprog); + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + if(!processedIDs.contains(iditer)) { + expandCluster(relation, rangeQuery, iditer.getDBID(), objprog, clusprog); } if(objprog != null && clusprog != null) { objprog.setProcessed(processedIDs.size(), logger); @@ -155,8 +156,8 @@ public class DBSCAN> extends AbstractDistanceBasedAlgor } } else { - for(DBID id : relation.iterDBIDs()) { - noise.add(id); + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + noise.add(iditer); if(objprog != null && clusprog != null) { objprog.setProcessed(noise.size(), logger); clusprog.setProcessed(resultList.size(), logger); @@ -210,35 +211,33 @@ public class DBSCAN> extends AbstractDistanceBasedAlgor // try to expand the cluster 
ModifiableDBIDs currentCluster = DBIDUtil.newArray(); for(DistanceResultPair seed : seeds) { - DBID nextID = seed.getDBID(); - if(!processedIDs.contains(nextID)) { - currentCluster.add(nextID); - processedIDs.add(nextID); + if(!processedIDs.contains(seed)) { + currentCluster.add(seed); + processedIDs.add(seed); } - else if(noise.contains(nextID)) { - currentCluster.add(nextID); - noise.remove(nextID); + else if(noise.contains(seed)) { + currentCluster.add(seed); + noise.remove(seed); } } seeds.remove(0); while(seeds.size() > 0) { - DBID o = seeds.remove(0).getDBID(); + DistanceResultPair o = seeds.remove(0); List> neighborhood = rangeQuery.getRangeForDBID(o, epsilon); if(neighborhood.size() >= minpts) { for(DistanceResultPair neighbor : neighborhood) { - DBID p = neighbor.getDBID(); - boolean inNoise = noise.contains(p); - boolean unclassified = !processedIDs.contains(p); + boolean inNoise = noise.contains(neighbor); + boolean unclassified = !processedIDs.contains(neighbor); if(inNoise || unclassified) { if(unclassified) { seeds.add(neighbor); } - currentCluster.add(p); - processedIDs.add(p); + currentCluster.add(neighbor); + processedIDs.add(neighbor); if(inNoise) { - noise.remove(p); + noise.remove(neighbor); } } } @@ -258,9 +257,7 @@ public class DBSCAN> extends AbstractDistanceBasedAlgor resultList.add(currentCluster); } else { - for(DBID id : currentCluster) { - noise.add(id); - } + noise.addDBIDs(currentCluster); noise.add(startObjectID); processedIDs.add(startObjectID); } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/DeLiClu.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/DeLiClu.java index f1e6c945..a0780e3d 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/DeLiClu.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/DeLiClu.java @@ -24,7 +24,6 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering; */ import java.util.Collection; -import java.util.Iterator; import java.util.List; import java.util.Set; @@ -36,6 +35,7 @@ import 
de.lmu.ifi.dbs.elki.data.type.TypeUtil; import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.datastore.DataStore; import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.distance.DistanceUtil; import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction; @@ -201,11 +201,11 @@ public class DeLiClu, D extends Distance> exte * @return the id of the start object for the run method */ private DBID getStartObject(Relation relation) { - Iterator it = relation.iterDBIDs(); - if(!it.hasNext()) { + DBIDIter it = relation.iterDBIDs(); + if(!it.valid()) { return null; } - return it.next(); + return it.getDBID(); } /** diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/EM.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/EM.java index a70a3f6f..63ebbabb 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/EM.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/EM.java @@ -39,7 +39,8 @@ import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; import de.lmu.ifi.dbs.elki.database.datastore.WritableDataStore; -import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; +import de.lmu.ifi.dbs.elki.database.ids.DBIDRef; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.relation.Relation; @@ -170,28 +171,31 @@ public class EM> extends AbstractAlgorithm means = initializer.chooseInitialMeans(relation, k, EuclideanDistanceFunction.STATIC); + List means = new ArrayList(); + for(NumberVector nv : initializer.chooseInitialMeans(relation, k, EuclideanDistanceFunction.STATIC)) { + means.add(nv.getColumnVector()); + } List covarianceMatrices = new ArrayList(k); - List 
normDistrFactor = new ArrayList(k); + double[] normDistrFactor = new double[k]; List invCovMatr = new ArrayList(k); - List clusterWeights = new ArrayList(k); + double[] clusterWeights = new double[k]; probClusterIGivenX = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_SORTED, double[].class); final int dimensionality = means.get(0).getDimensionality(); for(int i = 0; i < k; i++) { Matrix m = Matrix.identity(dimensionality, dimensionality); covarianceMatrices.add(m); - normDistrFactor.add(1.0 / Math.sqrt(Math.pow(MathUtil.TWOPI, dimensionality) * m.det())); + normDistrFactor[i] = 1.0 / Math.sqrt(Math.pow(MathUtil.TWOPI, dimensionality) * m.det()); invCovMatr.add(m.inverse()); - clusterWeights.add(1.0 / k); + clusterWeights[i] = 1.0 / k; if(logger.isDebuggingFinest()) { StringBuffer msg = new StringBuffer(); msg.append(" model ").append(i).append(":\n"); msg.append(" mean: ").append(means.get(i)).append("\n"); msg.append(" m:\n").append(FormatUtil.format(m, " ")).append("\n"); msg.append(" m.det(): ").append(m.det()).append("\n"); - msg.append(" cluster weight: ").append(clusterWeights.get(i)).append("\n"); - msg.append(" normDistFact: ").append(normDistrFactor.get(i)).append("\n"); + msg.append(" cluster weight: ").append(clusterWeights[i]).append("\n"); + msg.append(" normDistFact: ").append(normDistrFactor[i]).append("\n"); logger.debugFine(msg.toString()); } } @@ -216,31 +220,31 @@ public class EM> extends AbstractAlgorithm> extends AbstractAlgorithm> extends AbstractAlgorithm> extends AbstractAlgorithm> result = new Clustering>("EM Clustering", "em-clustering"); @@ -309,25 +313,25 @@ public class EM> extends AbstractAlgorithm database, List normDistrFactor, List means, List invCovMatr, List clusterWeights, WritableDataStore probClusterIGivenX) { + protected double assignProbabilitiesToInstances(Relation database, double[] normDistrFactor, List means, List invCovMatr, double[] clusterWeights, WritableDataStore 
probClusterIGivenX) { double emSum = 0.0; - for(DBID id : database.iterDBIDs()) { - Vector x = database.get(id).getColumnVector(); - List probabilities = new ArrayList(k); + for(DBIDIter iditer = database.iterDBIDs(); iditer.valid(); iditer.advance()) { + Vector x = database.get(iditer).getColumnVector(); + double[] probabilities = new double[k]; for(int i = 0; i < k; i++) { Vector difference = x.minus(means.get(i)); double rowTimesCovTimesCol = difference.transposeTimesTimes(invCovMatr.get(i), difference); double power = rowTimesCovTimesCol / 2.0; - double prob = normDistrFactor.get(i) * Math.exp(-power); + double prob = normDistrFactor[i] * Math.exp(-power); if(logger.isDebuggingFinest()) { logger.debugFinest(" difference vector= ( " + difference.toString() + " )\n" + " difference:\n" + FormatUtil.format(difference, " ") + "\n" + " rowTimesCovTimesCol:\n" + rowTimesCovTimesCol + "\n" + " power= " + power + "\n" + " prob=" + prob + "\n" + " inv cov matrix: \n" + FormatUtil.format(invCovMatr.get(i), " ")); } - probabilities.add(prob); + probabilities[i] = prob; } double priorProbability = 0.0; for(int i = 0; i < k; i++) { - priorProbability += probabilities.get(i) * clusterWeights.get(i); + priorProbability += probabilities[i] * clusterWeights[i]; } double logP = Math.max(Math.log(priorProbability), MIN_LOGLIKELIHOOD); if(!Double.isNaN(logP)) { @@ -337,16 +341,16 @@ public class EM> extends AbstractAlgorithm= 0.0); - assert (clusterWeights.get(i) >= 0.0); + assert (clusterWeights[i] >= 0.0); // do not divide by zero! 
if(priorProbability == 0.0) { clusterProbabilities[i] = 0.0; } else { - clusterProbabilities[i] = probabilities.get(i) / priorProbability * clusterWeights.get(i); + clusterProbabilities[i] = probabilities[i] / priorProbability * clusterWeights[i]; } } - probClusterIGivenX.put(id, clusterProbabilities); + probClusterIGivenX.put(iditer, clusterProbabilities); } return emSum; @@ -358,7 +362,7 @@ public class EM> extends AbstractAlgorithm> extends AbstractDistanceBasedAlgor if(getDistanceFunction() instanceof PrimitiveDoubleDistanceFunction && DoubleDistance.class.isInstance(epsilon)) { // Optimized codepath for double-based distances. Avoids Java // boxing/unboxing. - for(DBID id : relation.iterDBIDs()) { - if(!processedIDs.contains(id)) { + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + if(!processedIDs.contains(iditer)) { // We need to do some ugly casts to be able to run the optimized version, unfortunately. @SuppressWarnings("unchecked") final ClusterOrderResult doubleClusterOrder = ClusterOrderResult.class.cast(clusterOrder); @SuppressWarnings("unchecked") final RangeQuery doubleRangeQuery = RangeQuery.class.cast(rangeQuery); final DoubleDistance depsilon = DoubleDistance.class.cast(epsilon); - expandClusterOrderDouble(doubleClusterOrder, database, doubleRangeQuery, id, depsilon, progress); + expandClusterOrderDouble(doubleClusterOrder, database, doubleRangeQuery, iditer.getDBID(), depsilon, progress); } } } else { - for(DBID id : relation.iterDBIDs()) { - if(!processedIDs.contains(id)) { - expandClusterOrder(clusterOrder, database, rangeQuery, id, epsilon, progress); + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + if(!processedIDs.contains(iditer)) { + expandClusterOrder(clusterOrder, database, rangeQuery, iditer.getDBID(), epsilon, progress); } } } @@ -194,7 +195,7 @@ public class OPTICS> extends AbstractDistanceBasedAlgor D coreDistance = last.getDistance(); for(DistanceResultPair neighbor 
: neighbors) { - if(processedIDs.contains(neighbor.getDBID())) { + if(processedIDs.contains(neighbor)) { continue; } D reachability = DistanceUtil.max(neighbor.getDistance(), coreDistance); @@ -234,7 +235,7 @@ public class OPTICS> extends AbstractDistanceBasedAlgor double coreDistance = ((DoubleDistanceResultPair) last).getDoubleDistance(); for(DistanceResultPair neighbor : neighbors) { - if(processedIDs.contains(neighbor.getDBID())) { + if(processedIDs.contains(neighbor)) { continue; } double reachability = Math.max(((DoubleDistanceResultPair) neighbor).getDoubleDistance(), coreDistance); @@ -247,7 +248,7 @@ public class OPTICS> extends AbstractDistanceBasedAlgor double coreDistance = last.getDistance().doubleValue(); for(DistanceResultPair neighbor : neighbors) { - if(processedIDs.contains(neighbor.getDBID())) { + if(processedIDs.contains(neighbor)) { continue; } double reachability = Math.max(neighbor.getDistance().doubleValue(), coreDistance); diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICSTypeAlgorithm.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICSTypeAlgorithm.java index d6c5872a..3ead6f3e 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICSTypeAlgorithm.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICSTypeAlgorithm.java @@ -39,7 +39,7 @@ import de.lmu.ifi.dbs.elki.result.optics.ClusterOrderResult; */ public interface OPTICSTypeAlgorithm> extends Algorithm { @Override - ClusterOrderResult run(Database database) throws IllegalStateException; + ClusterOrderResult run(Database database); /** * Get the minpts value used. Needed for OPTICS Xi etc. 
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/SLINK.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/SLINK.java index 45b12c43..2aa38bdd 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/SLINK.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/SLINK.java @@ -27,7 +27,6 @@ import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; import java.util.HashMap; -import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Map.Entry; @@ -49,6 +48,8 @@ import de.lmu.ifi.dbs.elki.database.datastore.WritableDataStore; import de.lmu.ifi.dbs.elki.database.datastore.WritableRecordStore; import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; +import de.lmu.ifi.dbs.elki.database.ids.DBIDMIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; @@ -144,7 +145,8 @@ public class SLINK> extends AbstractDistanceBasedAlgori ModifiableDBIDs processedIDs = DBIDUtil.newArray(relation.size()); // apply the algorithm - for(DBID id : relation.iterDBIDs()) { + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + DBID id = iditer.getDBID(); step1(id); step2(id, processedIDs, distQuery, m); step3(id, processedIDs, m); @@ -200,7 +202,8 @@ public class SLINK> extends AbstractDistanceBasedAlgori * @param distFunc Distance function to use */ private void step2(DBID newID, DBIDs processedIDs, DistanceQuery distFunc, WritableDataStore m) { - for(DBID id : processedIDs) { + for(DBIDIter it = processedIDs.iter(); it.valid(); it.advance()) { + DBID id = it.getDBID(); // M(i) = dist(i, n+1) m.put(id, distFunc.distance(id, newID)); } @@ -215,7 +218,8 @@ public class SLINK> extends AbstractDistanceBasedAlgori */ private void step3(DBID newID, DBIDs processedIDs, WritableDataStore m) { // for 
i = 1..n - for(DBID id : processedIDs) { + for(DBIDIter it = processedIDs.iter(); it.valid(); it.advance()) { + DBID id = it.getDBID(); D l_i = lambda.get(id); D m_i = m.get(id); DBID p_i = pi.get(id); @@ -247,7 +251,8 @@ public class SLINK> extends AbstractDistanceBasedAlgori */ private void step4(DBID newID, DBIDs processedIDs) { // for i = 1..n - for(DBID id : processedIDs) { + for(DBIDIter it = processedIDs.iter(); it.valid(); it.advance()) { + DBID id = it.getDBID(); D l_i = lambda.get(id); D lp_i = lambda.get(pi.get(id)); @@ -303,7 +308,8 @@ public class SLINK> extends AbstractDistanceBasedAlgori // extract the child clusters Map cluster_ids = new HashMap(); Map cluster_distances = new HashMap(); - for(DBID id : ids) { + for(DBIDIter it = ids.iter(); it.valid(); it.advance()) { + DBID id = it.getDBID(); DBID lastObjectInCluster = lastObjectInCluster(id, stopdist, pi, lambda); ModifiableDBIDs cluster = cluster_ids.get(lastObjectInCluster); if(cluster == null) { @@ -387,7 +393,7 @@ public class SLINK> extends AbstractDistanceBasedAlgori } // right child DBID rightID = pi.get(leftID); - if(leftID.equals(rightID)) { + if(leftID.sameDBID(rightID)) { break; } Cluster> right = nodes.get(rightID); @@ -472,11 +478,12 @@ public class SLINK> extends AbstractDistanceBasedAlgori FiniteProgress progress = logger.isVerbose() ? 
new FiniteProgress("Extracting clusters", ids.size(), logger) : null; - for(DBID cur : order) { - DBID dest = pi.get(cur); - D l = lambda.get(cur); + for(DBIDIter it = order.iter(); it.valid(); it.advance()) { + DBID dest = pi.get(it); + D l = lambda.get(it); // logger.debugFine("DBID " + cur.toString() + " dist: " + l.toString()); if(stopdist != null && stopdist.compareTo(l) > 0) { + DBID cur = it.getDBID(); ModifiableDBIDs curset = cids.remove(cur); ModifiableDBIDs destset = cids.get(dest); if(destset == null) { @@ -511,13 +518,11 @@ public class SLINK> extends AbstractDistanceBasedAlgori Cluster cluster = new Cluster(cname, clusids, ClusterModel.CLUSTER, hier); // Collect child clusters and clean up the cluster ids, keeping only // "new" objects. - Iterator iter = clusids.iterator(); - while(iter.hasNext()) { - DBID child = iter.next(); - Cluster chiclus = clusters.get(child); + for(DBIDMIter iter = clusids.iter(); iter.valid(); iter.advance()) { + Cluster chiclus = clusters.get(iter); if(chiclus != null) { hier.add(cluster, chiclus); - clusters.remove(child); + clusters.remove(iter); iter.remove(); } } @@ -545,7 +550,7 @@ public class SLINK> extends AbstractDistanceBasedAlgori cids.put(dest, destset); destset.add(dest); } - destset.add(cur); + destset.add(it); } } // Decrement counter diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/SNNClustering.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/SNNClustering.java index 7c3a13c9..ae612b2a 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/SNNClustering.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/SNNClustering.java @@ -37,6 +37,7 @@ import de.lmu.ifi.dbs.elki.data.type.TypeUtil; import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import 
de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.query.similarity.SimilarityQuery; @@ -154,9 +155,9 @@ public class SNNClustering extends AbstractAlgorithm> imple noise = DBIDUtil.newHashSet(); processedIDs = DBIDUtil.newHashSet(relation.size()); if(relation.size() >= minpts) { - for(DBID id : snnInstance.getRelation().iterDBIDs()) { + for(DBIDIter id = snnInstance.getRelation().iterDBIDs(); id.valid(); id.advance()) { if(!processedIDs.contains(id)) { - expandCluster(snnInstance, id, objprog, clusprog); + expandCluster(snnInstance, id.getDBID(), objprog, clusprog); if(processedIDs.size() == relation.size() && noise.size() == 0) { break; } @@ -168,7 +169,7 @@ public class SNNClustering extends AbstractAlgorithm> imple } } else { - for(DBID id : snnInstance.getRelation().iterDBIDs()) { + for(DBIDIter id = snnInstance.getRelation().iterDBIDs(); id.valid(); id.advance()) { noise.add(id); if(objprog != null && clusprog != null) { objprog.setProcessed(noise.size(), logger); @@ -202,9 +203,9 @@ public class SNNClustering extends AbstractAlgorithm> imple */ protected ArrayModifiableDBIDs findSNNNeighbors(SimilarityQuery snnInstance, DBID queryObject) { ArrayModifiableDBIDs neighbors = DBIDUtil.newArray(); - for(DBID id : snnInstance.getRelation().iterDBIDs()) { - if(snnInstance.similarity(queryObject, id).compareTo(epsilon) >= 0) { - neighbors.add(id); + for(DBIDIter iditer = snnInstance.getRelation().iterDBIDs(); iditer.valid(); iditer.advance()) { + if(snnInstance.similarity(queryObject, iditer).compareTo(epsilon) >= 0) { + neighbors.add(iditer); } } return neighbors; @@ -237,7 +238,7 @@ public class SNNClustering extends AbstractAlgorithm> imple // try to expand the cluster ModifiableDBIDs currentCluster = DBIDUtil.newArray(); - for(DBID seed : seeds) { + for(DBIDIter seed = seeds.iter(); seed.valid(); seed.advance()) { if(!processedIDs.contains(seed)) { currentCluster.add(seed); processedIDs.add(seed); @@ -253,7 +254,8 @@ public 
class SNNClustering extends AbstractAlgorithm> imple ArrayModifiableDBIDs neighborhood = findSNNNeighbors(snnInstance, o); if(neighborhood.size() >= minpts) { - for(DBID p : neighborhood) { + for(DBIDIter iter = neighborhood.iter(); iter.valid(); iter.advance()) { + DBID p = iter.getDBID(); boolean inNoise = noise.contains(p); boolean unclassified = !processedIDs.contains(p); if(inNoise || unclassified) { @@ -283,9 +285,7 @@ public class SNNClustering extends AbstractAlgorithm> imple resultList.add(currentCluster); } else { - for(DBID id : currentCluster) { - noise.add(id); - } + noise.addDBIDs(currentCluster); noise.add(startObjectID); processedIDs.add(startObjectID); } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/CASH.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/CASH.java index b877415e..e4c6a123 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/CASH.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/CASH.java @@ -48,7 +48,7 @@ import de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation; import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.ProxyDatabase; import de.lmu.ifi.dbs.elki.database.QueryUtil; -import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; @@ -86,9 +86,9 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; * Provides the CASH algorithm, an subspace clustering algorithm based on the * Hough transform. * - * Note: CASH requires explicitly setting the input parser other than default to - * {@link de.lmu.ifi.dbs.elki.datasource.parser.ParameterizationFunctionLabelParser}: - * (in the MiniGui, set option: dbc.parser ParameterizationFunctionLabelParser). 
+ * Note: CASH requires explicitly setting the input vector type to + * {@link ParameterizationFunction}: + * (in the MiniGui, set option: parser.vector-type ParameterizationFunction). * *

* Reference: E. Achtert, C. Böhm, J. David, P. Kröger, A. Zimek: Robust @@ -503,9 +503,9 @@ public class CASH extends AbstractAlgorithm> implements Cluste proxy.addRelation(prep); // Project - for(DBID id : ids) { - ParameterizationFunction f = project(basis, relation.get(id)); - prep.set(id, f); + for(DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) { + ParameterizationFunction f = project(basis, relation.get(iter)); + prep.set(iter, f); } if(logger.isDebugging()) { @@ -662,8 +662,8 @@ public class CASH extends AbstractAlgorithm> implements Cluste double d_min = Double.POSITIVE_INFINITY; double d_max = Double.NEGATIVE_INFINITY; - for(DBID id : relation.iterDBIDs()) { - ParameterizationFunction f = relation.get(id); + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + ParameterizationFunction f = relation.get(iditer); HyperBoundingBox minMax = f.determineAlphaMinMax(box); double f_min = f.function(SpatialUtil.getMin(minMax)); double f_max = f.function(SpatialUtil.getMax(minMax)); @@ -709,11 +709,11 @@ public class CASH extends AbstractAlgorithm> implements Cluste ids.addDBIDs(interval.getIDs()); // Search for nearby vectors in original database - for(DBID id : relation.iterDBIDs()) { - DoubleVector v = new DoubleVector(relation.get(id).getColumnVector().getArrayRef()); + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + DoubleVector v = new DoubleVector(relation.get(iditer).getColumnVector().getArrayRef()); DoubleDistance d = df.distance(v, centroid); if(d.compareTo(eps) < 0) { - ids.add(id); + ids.add(iditer); } } @@ -735,15 +735,15 @@ public class CASH extends AbstractAlgorithm> implements Cluste private Database buildDerivatorDB(Relation relation, CASHInterval interval) throws UnableToComplyException { DBIDs ids = interval.getIDs(); ProxyDatabase proxy = new ProxyDatabase(ids); - int dim = relation.get(ids.iterator().next()).getDimensionality(); + int dim = 
DatabaseUtil.dimensionality(relation); SimpleTypeInformation type = new VectorFieldTypeInformation(DoubleVector.class, dim, new DoubleVector(new double[dim])); MaterializedRelation prep = new MaterializedRelation(proxy, type, ids); proxy.addRelation(prep); // Project - for(DBID id : ids) { - DoubleVector v = new DoubleVector(relation.get(id).getColumnVector().getArrayRef()); - prep.set(id, v); + for(DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) { + DoubleVector v = new DoubleVector(relation.get(iter).getColumnVector().getArrayRef()); + prep.set(iter, v); } if(logger.isDebugging()) { @@ -800,15 +800,15 @@ public class CASH extends AbstractAlgorithm> implements Cluste */ private Database buildDerivatorDB(Relation relation, DBIDs ids) throws UnableToComplyException { ProxyDatabase proxy = new ProxyDatabase(ids); - int dim = relation.get(ids.iterator().next()).getDimensionality(); + int dim = DatabaseUtil.dimensionality(relation); SimpleTypeInformation type = new VectorFieldTypeInformation(DoubleVector.class, dim, new DoubleVector(new double[dim])); MaterializedRelation prep = new MaterializedRelation(proxy, type, ids); proxy.addRelation(prep); // Project - for(DBID id : ids) { - DoubleVector v = new DoubleVector(relation.get(id).getColumnVector().getArrayRef()); - prep.set(id, v); + for(DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) { + DoubleVector v = new DoubleVector(relation.get(iter).getColumnVector().getArrayRef()); + prep.set(iter, v); } return proxy; diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/COPAC.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/COPAC.java index 575bf117..1d41d37e 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/COPAC.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/COPAC.java @@ -41,6 +41,7 @@ import de.lmu.ifi.dbs.elki.data.type.TypeInformation; import de.lmu.ifi.dbs.elki.data.type.TypeUtil; import 
de.lmu.ifi.dbs.elki.database.ProxyDatabase; import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; @@ -176,7 +177,7 @@ public class COPAC, D extends Distance> extends * @return Clustering result */ @SuppressWarnings("unchecked") - public Clustering run(Relation relation) throws IllegalStateException { + public Clustering run(Relation relation) { if(logger.isVerbose()) { logger.verbose("Running COPAC on db size = " + relation.size() + " with dimensionality = " + DatabaseUtil.dimensionality(relation)); } @@ -189,14 +190,14 @@ public class COPAC, D extends Distance> extends FiniteProgress partitionProgress = logger.isVerbose() ? new FiniteProgress("Partitioning", relation.size(), logger) : null; int processed = 1; - for(DBID id : relation.iterDBIDs()) { - Integer corrdim = preprocin.getLocalProjection(id).getCorrelationDimension(); + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + int corrdim = preprocin.getLocalProjection(iditer).getCorrelationDimension(); if(!partitionMap.containsKey(corrdim)) { partitionMap.put(corrdim, DBIDUtil.newArray()); } - partitionMap.get(corrdim).add(id); + partitionMap.get(corrdim).add(iditer); if(partitionProgress != null) { partitionProgress.setProcessed(processed++, logger); } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/ERiC.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/ERiC.java index af4f677f..b57a6e29 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/ERiC.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/ERiC.java @@ -118,7 +118,7 @@ public class ERiC> extends AbstractAlgorithm> run(Relation relation) throws IllegalStateException { + public Clustering> run(Relation relation) { final int dimensionality = 
DatabaseUtil.dimensionality(relation); StepProgress stepprog = logger.isVerbose() ? new StepProgress(3) : null; @@ -291,7 +291,7 @@ public class ERiC> extends AbstractAlgorithm>>> clusterMap, DistanceQuery query) throws IllegalStateException { + private void buildHierarchy(SortedMap>>> clusterMap, DistanceQuery query) { StringBuffer msg = new StringBuffer(); DBSCAN dbscan = ClassGenericsUtil.castWithGenericsOrNull(DBSCAN.class, copacAlgorithm.getPartitionAlgorithm(query)); diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/LMCLUS.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/LMCLUS.java index 41ee1f69..b8942de8 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/LMCLUS.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/LMCLUS.java @@ -35,7 +35,7 @@ import de.lmu.ifi.dbs.elki.data.model.Model; import de.lmu.ifi.dbs.elki.data.type.TypeInformation; import de.lmu.ifi.dbs.elki.data.type.TypeUtil; import de.lmu.ifi.dbs.elki.database.Database; -import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; @@ -175,9 +175,9 @@ public class LMCLUS extends AbstractAlgorithm> { break; } ModifiableDBIDs subset = DBIDUtil.newArray(current.size()); - for(DBID id : current) { - if(deviation(relation.get(id).getColumnVector().minusEquals(separation.originV), separation.basis) < separation.threshold) { - subset.add(id); + for(DBIDIter iter = current.iter(); iter.valid(); iter.advance()) { + if(deviation(relation.get(iter).getColumnVector().minusEquals(separation.originV), separation.basis) < separation.threshold) { + subset.add(iter); } } // logger.verbose("size:"+subset.size()); @@ -265,16 +265,16 @@ public class LMCLUS extends AbstractAlgorithm> { int remaining_retries = 100; for(int i = 1; i <= samples; i++) { DBIDs sample 
= DBIDUtil.randomSample(currentids, dimension + 1, r.nextLong()); - final Iterator iter = sample.iterator(); + final DBIDIter iter = sample.iter(); // Use first as origin - DBID origin = iter.next(); - Vector originV = relation.get(origin).getColumnVector(); + Vector originV = relation.get(iter).getColumnVector(); + iter.advance(); // Build orthogonal basis from remainder Matrix basis; { List vectors = new ArrayList(sample.size() - 1); - while(iter.hasNext()) { - Vector vec = relation.get(iter.next()).getColumnVector(); + for(;iter.valid(); iter.advance()) { + Vector vec = relation.get(iter).getColumnVector(); vectors.add(vec.minusEquals(originV)); } // generate orthogonal basis @@ -292,12 +292,12 @@ public class LMCLUS extends AbstractAlgorithm> { // Generate and fill a histogram. FlexiHistogram histogram = FlexiHistogram.DoubleSumHistogram(BINS); double w = 1.0 / currentids.size(); - for(DBID point : currentids) { + for(DBIDIter iter2 = currentids.iter(); iter2.valid(); iter2.advance()) { // Skip sampled points - if(sample.contains(point)) { + if(sample.contains(iter2)) { continue; } - Vector vec = relation.get(point).getColumnVector().minusEquals(originV); + Vector vec = relation.get(iter2).getColumnVector().minusEquals(originV); final double distance = deviation(vec, basis); histogram.aggregate(distance, w); } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/ORCLUS.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/ORCLUS.java index eb5608fc..2e9f4a9b 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/ORCLUS.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/ORCLUS.java @@ -38,6 +38,7 @@ import de.lmu.ifi.dbs.elki.data.type.TypeInformation; import de.lmu.ifi.dbs.elki.data.type.TypeUtil; import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import 
de.lmu.ifi.dbs.elki.database.ids.DBIDs; import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; @@ -139,8 +140,11 @@ public class ORCLUS> extends AbstractProjectedClust /** * Performs the ORCLUS algorithm on the given database. + * + * @param database Database + * @param relation Relation */ - public Clustering run(Database database, Relation relation) throws IllegalStateException { + public Clustering run(Database database, Relation relation) { try { DistanceQuery distFunc = this.getDistanceQuery(database); // current dimensionality associated with each seed @@ -211,8 +215,8 @@ public class ORCLUS> extends AbstractProjectedClust DBIDs randomSample = DBIDUtil.randomSample(database.getDBIDs(), k, seed); V factory = DatabaseUtil.assumeVectorField(database).getFactory(); List seeds = new ArrayList(); - for(DBID id : randomSample) { - seeds.add(new ORCLUSCluster(database.get(id), id, factory)); + for(DBIDIter iter = randomSample.iter(); iter.valid(); iter.advance()) { + seeds.add(new ORCLUSCluster(database.get(iter), iter.getDBID(), factory)); } return seeds; } @@ -240,10 +244,8 @@ public class ORCLUS> extends AbstractProjectedClust } // for each data point o do - Iterator it = database.iterDBIDs(); - while(it.hasNext()) { - DBID id = it.next(); - V o = database.get(id); + for (DBIDIter it = database.iterDBIDs(); it.valid(); it.advance()) { + V o = database.get(it); DoubleDistance minDist = null; ORCLUSCluster minCluster = null; @@ -260,7 +262,7 @@ public class ORCLUS> extends AbstractProjectedClust } // add p to the cluster with the least value of projected distance assert minCluster != null; - minCluster.objectIDs.add(id); + minCluster.objectIDs.add(it); } // recompute the seed in each clusters @@ -285,10 +287,9 @@ public class ORCLUS> extends AbstractProjectedClust // covariance matrix of cluster // Matrix covariance = Util.covarianceMatrix(database, cluster.objectIDs); List> results = new ArrayList>(cluster.objectIDs.size()); - for(Iterator it = 
cluster.objectIDs.iterator(); it.hasNext();) { - DBID id = it.next(); - DoubleDistance distance = distFunc.distance(cluster.centroid, database.get(id)); - DistanceResultPair qr = new GenericDistanceResultPair(distance, id); + for(DBIDIter it = cluster.objectIDs.iter(); it.valid(); it.advance()) { + DoubleDistance distance = distFunc.distance(cluster.centroid, database.get(it)); + DistanceResultPair qr = new GenericDistanceResultPair(distance, it.getDBID()); results.add(qr); } Collections.sort(results); @@ -407,9 +408,8 @@ public class ORCLUS> extends AbstractProjectedClust DoubleDistance sum = getDistanceFunction().getDistanceFactory().nullDistance(); V c_proj = projection(c_ij, c_ij.centroid, factory); - for(DBID id : c_ij.objectIDs) { - V o = database.get(id); - V o_proj = projection(c_ij, o, factory); + for(DBIDIter iter = c_ij.objectIDs.iter(); iter.valid(); iter.advance()) { + V o_proj = projection(c_ij, database.get(iter), factory); DoubleDistance dist = distFunc.distance(o_proj, c_proj); sum = sum.plus(dist.times(dist)); } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/cash/CASHIntervalSplit.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/cash/CASHIntervalSplit.java index 86e045cb..b0a12832 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/cash/CASHIntervalSplit.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/cash/CASHIntervalSplit.java @@ -30,6 +30,7 @@ import de.lmu.ifi.dbs.elki.data.HyperBoundingBox; import de.lmu.ifi.dbs.elki.data.ParameterizationFunction; import de.lmu.ifi.dbs.elki.data.spatial.SpatialUtil; import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; @@ -114,7 +115,8 @@ public class CASHIntervalSplit { f_maxima.put(interval, maxima); } - for(DBID id : superSetIDs) { + 
for(DBIDIter iter = superSetIDs.iter(); iter.valid(); iter.advance()) { + DBID id = iter.getDBID(); Double f_min = minima.get(id); Double f_max = maxima.get(id); diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/CorePredicate.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/CorePredicate.java new file mode 100644 index 00000000..e75a89dc --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/CorePredicate.java @@ -0,0 +1,80 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.gdbscan; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2012 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see . + */ + +import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation; +import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.ids.DBIDRef; + +/** + * Predicate for GeneralizedDBSCAN to evaluate whether a point is a core point + * or not. + * + * Note the Factory/Instance split of this interface. + * + * @author Erich Schubert + * + * @apiviz.has Instance + */ +public interface CorePredicate { + /** + * Constant for the generic type {@code List>} + */ + public static final String NEIGHBOR_LIST = "neighborhood-list"; + + /** + * Instantiate for a database. 
+ * + * @param database Database to instantiate for + * @param type Type to instantiate for + * @return Instance + */ + public Instance instantiate(Database database, SimpleTypeInformation type); + + /** + * Test whether the neighborhood type T is accepted by this predicate. + * + * @param type Type information + * @return true when the type is accepted + */ + public boolean acceptsType(SimpleTypeInformation type); + + /** + * Instance for a particular data set. + * + * @author Erich Schubert + * + * @param actual type + */ + public static interface Instance { + /** + * Decide whether the point is a core point, based on its neighborhood. + * + * @param point Query point + * @param neighbors Neighbors + * @return core point property + */ + public boolean isCorePoint(DBIDRef point, T neighbors); + } +} \ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/EpsilonNeighborPredicate.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/EpsilonNeighborPredicate.java new file mode 100644 index 00000000..cb24e8f1 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/EpsilonNeighborPredicate.java @@ -0,0 +1,268 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.gdbscan; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2012 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see . + */ + +import java.util.List; + +import de.lmu.ifi.dbs.elki.algorithm.AbstractDistanceBasedAlgorithm; +import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation; +import de.lmu.ifi.dbs.elki.data.type.TypeInformation; +import de.lmu.ifi.dbs.elki.data.type.TypeUtil; +import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.QueryUtil; +import de.lmu.ifi.dbs.elki.database.ids.DBIDRef; +import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.ids.DBIDs; +import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; +import de.lmu.ifi.dbs.elki.database.query.DistanceDBIDResult; +import de.lmu.ifi.dbs.elki.database.query.DistanceResultPair; +import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; +import de.lmu.ifi.dbs.elki.database.query.range.RangeQuery; +import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancefunction.EuclideanDistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance; +import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; +import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DistanceParameter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; + +/** + * The default DBSCAN and OPTICS neighbor predicate, using an + * epsilon-neighborhood. + * + *

+ * Reference:
+ * M. Ester, H.-P. Kriegel, J. Sander, and X. Xu: A Density-Based Algorithm for + * Discovering Clusters in Large Spatial Databases with Noise.
+ * In Proc. 2nd Int. Conf. on Knowledge Discovery and Data Mining (KDD '96), + * Portland, OR, 1996. + *

+ * + * @author Erich Schubert + * + * @param Distance type + */ +@Reference(authors = "M. Ester, H.-P. Kriegel, J. Sander, and X. Xu", title = "A Density-Based Algorithm for Discovering Clusters in Large Spatial Databases with Noise", booktitle = "Proc. 2nd Int. Conf. on Knowledge Discovery and Data Mining (KDD '96), Portland, OR, 1996", url = "http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.71.1980") +public class EpsilonNeighborPredicate> implements NeighborPredicate { + /** + * Range to query with + */ + D epsilon; + + /** + * Distance function to use + */ + DistanceFunction distFunc; + + /** + * Full constructor. + * + * @param epsilon Epsilon value + * @param distFunc Distance function to use + */ + public EpsilonNeighborPredicate(D epsilon, DistanceFunction distFunc) { + super(); + this.epsilon = epsilon; + this.distFunc = distFunc; + } + + @SuppressWarnings("unchecked") + @Override + public Instance instantiate(Database database, SimpleTypeInformation type) { + if(TypeUtil.DBIDS.isAssignableFromType(type)) { + DistanceQuery dq = QueryUtil.getDistanceQuery(database, distFunc); + RangeQuery rq = database.getRangeQuery(dq); + return (Instance) new DBIDInstance(epsilon, rq, dq.getRelation().getDBIDs()); + } + if(TypeUtil.NEIGHBORLIST.isAssignableFromType(type)) { + DistanceQuery dq = QueryUtil.getDistanceQuery(database, distFunc); + RangeQuery rq = database.getRangeQuery(dq); + return (Instance) new NeighborListInstance(epsilon, rq, dq.getRelation().getDBIDs()); + } + throw new AbortException("Incompatible predicate types"); + } + + @Override + public SimpleTypeInformation[] getOutputType() { + return new SimpleTypeInformation[] { TypeUtil.DBIDS, TypeUtil.NEIGHBORLIST }; + } + + @Override + public TypeInformation getInputTypeRestriction() { + return distFunc.getInputTypeRestriction(); + } + + /** + * Instance for a particular data set. 
+ * + * @author Erich Schubert + */ + public static class DBIDInstance> implements NeighborPredicate.Instance { + /** + * Range to query with + */ + D epsilon; + + /** + * Range query to use on the database. + */ + RangeQuery rq; + + /** + * DBIDs to process + */ + DBIDs ids; + + /** + * Constructor. + * + * @param epsilon Epsilon + * @param rq Range query to use + * @param ids DBIDs to process + */ + public DBIDInstance(D epsilon, RangeQuery rq, DBIDs ids) { + super(); + this.epsilon = epsilon; + this.rq = rq; + this.ids = ids; + } + + @Override + public DBIDs getIDs() { + return ids; + } + + @Override + public DBIDs getNeighbors(DBIDRef reference) { + List> res = rq.getRangeForDBID(reference, epsilon); + // Throw away the actual distance values ... + ModifiableDBIDs neighbors = DBIDUtil.newHashSet(res.size()); + for(DistanceResultPair dr : res) { + neighbors.add(dr); + } + return neighbors; + } + + @Override + public void addDBIDs(ModifiableDBIDs ids, DBIDs neighbors) { + ids.addDBIDs(neighbors); + } + } + + /** + * Instance for a particular data set. + * + * @author Erich Schubert + */ + public static class NeighborListInstance> implements NeighborPredicate.Instance> { + /** + * Range to query with + */ + D epsilon; + + /** + * Range query to use on the database. + */ + RangeQuery rq; + + /** + * DBIDs to process + */ + DBIDs ids; + + /** + * Constructor. 
+ * + * @param epsilon Epsilon + * @param rq Range query to use + * @param ids DBIDs to process + */ + public NeighborListInstance(D epsilon, RangeQuery rq, DBIDs ids) { + super(); + this.epsilon = epsilon; + this.rq = rq; + this.ids = ids; + } + + @Override + public DBIDs getIDs() { + return ids; + } + + @Override + public DistanceDBIDResult getNeighbors(DBIDRef reference) { + return rq.getRangeForDBID(reference, epsilon); + } + + @Override + public void addDBIDs(ModifiableDBIDs ids, DistanceDBIDResult neighbors) { + for(DistanceResultPair neighbor : neighbors) { + ids.add(neighbor); + } + } + } + + /** + * Parameterization class + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer> extends AbstractParameterizer { + /** + * Range to query with + */ + D epsilon; + + /** + * Distance function to use + */ + DistanceFunction distfun = null; + + @Override + protected void makeOptions(Parameterization config) { + super.makeOptions(config); + // Get a distance function. 
+ ObjectParameter> distanceP = new ObjectParameter>(AbstractDistanceBasedAlgorithm.DISTANCE_FUNCTION_ID, DistanceFunction.class, EuclideanDistanceFunction.class); + D distanceFactory = null; + if(config.grab(distanceP)) { + distfun = distanceP.instantiateClass(config); + distanceFactory = distfun.getDistanceFactory(); + } + // Get the epsilon parameter + DistanceParameter epsilonP = new DistanceParameter(de.lmu.ifi.dbs.elki.algorithm.clustering.DBSCAN.EPSILON_ID, distanceFactory); + if(config.grab(epsilonP)) { + epsilon = epsilonP.getValue(); + } + } + + @Override + protected EpsilonNeighborPredicate makeInstance() { + return new EpsilonNeighborPredicate(epsilon, distfun); + } + } +} \ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/GeneralizedDBSCAN.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/GeneralizedDBSCAN.java new file mode 100644 index 00000000..2e1c2093 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/GeneralizedDBSCAN.java @@ -0,0 +1,323 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.gdbscan; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2012 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see . 
+ */ + +import gnu.trove.list.array.TIntArrayList; + +import java.util.ArrayList; + +import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm; +import de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm; +import de.lmu.ifi.dbs.elki.data.Cluster; +import de.lmu.ifi.dbs.elki.data.Clustering; +import de.lmu.ifi.dbs.elki.data.model.ClusterModel; +import de.lmu.ifi.dbs.elki.data.model.Model; +import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation; +import de.lmu.ifi.dbs.elki.data.type.TypeInformation; +import de.lmu.ifi.dbs.elki.data.type.TypeUtil; +import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; +import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; +import de.lmu.ifi.dbs.elki.database.datastore.WritableIntegerDataStore; +import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs; +import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; +import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.ids.DBIDs; +import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress; +import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress; +import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; +import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; + +/** + * Generalized DBSCAN, density-based clustering with noise. + *

+ * Reference:
+ * Jörg Sander, Martin Ester, Hans-Peter Kriegel, Xiaowei Xu:
+ * Density-Based Clustering in Spatial Databases: The Algorithm GDBSCAN and Its + * Applications
+ * In: Data Mining and Knowledge Discovery, 1998. + *

+ * + * @author Erich Schubert + * @author Arthur Zimek + * + * @apiviz.has Instance + */ +@Reference(authors = "Jörg Sander, Martin Ester, Hans-Peter Kriegel, Xiaowei Xu", title = "Density-Based Clustering in Spatial Databases: The Algorithm GDBSCAN and Its Applications", booktitle = "Data Mining and Knowledge Discovery", url = "http://dx.doi.org/10.1023/A:1009745219419") +public class GeneralizedDBSCAN extends AbstractAlgorithm> implements ClusteringAlgorithm> { + /** + * Get a logger for this algorithm + */ + final static Logging logger = Logging.getLogger(GeneralizedDBSCAN.class); + + /** + * The neighborhood predicate factory. + */ + NeighborPredicate npred; + + /** + * The core predicate factory. + */ + CorePredicate corepred; + + /** + * Constructor for parameterized algorithm. + * + * @param npred Neighbor predicate + * @param corepred Core point predicate + */ + public GeneralizedDBSCAN(NeighborPredicate npred, CorePredicate corepred) { + super(); + this.npred = npred; + this.corepred = corepred; + } + + @Override + public Clustering run(Database database) { + for (SimpleTypeInformation t : npred.getOutputType()) { + if (corepred.acceptsType(t)) { + return new Instance(npred.instantiate(database, t), corepred.instantiate(database, t)).run(); + } + } + throw new AbortException("No compatible types found."); + } + + @Override + public TypeInformation[] getInputTypeRestriction() { + return TypeUtil.array(npred.getInputTypeRestriction()); + } + + @Override + protected Logging getLogger() { + return logger; + } + + /** + * Instance for a particular data set. 
+ * + * @author Erich Schubert + */ + public class Instance { + /** + * The neighborhood predicate + */ + final NeighborPredicate.Instance npred; + + /** + * The core object property + */ + final CorePredicate.Instance corepred; + + /** + * Full Constructor + * + * @param npred Neighborhood predicate + * @param corepred Core object predicate + */ + public Instance(NeighborPredicate.Instance npred, CorePredicate.Instance corepred) { + super(); + this.npred = npred; + this.corepred = corepred; + } + + /** + * Run the actual DBSCAN algorithm. + * + * @return Clustering result + */ + public Clustering run() { + final DBIDs ids = npred.getIDs(); + // Setup progress logging + final FiniteProgress progress = logger.isVerbose() ? new FiniteProgress("Clustering", ids.size(), logger) : null; + final IndefiniteProgress clusprogress = logger.isVerbose() ? new IndefiniteProgress("Clusters", logger) : null; + // (Temporary) store the cluster ID assigned. + final WritableIntegerDataStore clusterids = DataStoreUtil.makeIntegerStorage(ids, DataStoreFactory.HINT_TEMP, -2); + // Note: these are not exact! + final TIntArrayList clustersizes = new TIntArrayList(); + + // Implementation note: cluster ids are kept as primitive ints in the + // data store above; -2 marks unprocessed objects and -1 marks noise. + final int noiseid = -1; + int clusterid = 0; + int clustersize = 0; + int noisesize = 0; + // Iterate over all objects in the database. + for(DBIDIter id = ids.iter(); id.valid(); id.advance()) { + // Skip already processed ids. + if(clusterids.intValue(id) > -2) { + continue; + } + // Evaluate Neighborhood predicate + final T neighbors = npred.getNeighbors(id); + // Evaluate Core-Point predicate: + if(corepred.isCorePoint(id, neighbors)) { + clusterids.putInt(id, clusterid); + clustersize = 1 + setbasedExpandCluster(clusterid, clusterids, neighbors, progress); + // start next cluster on next iteration.
+ clustersizes.add(clustersize); + clustersize = 0; + clusterid += 1; + if(clusprogress != null) { + clusprogress.setProcessed(clusterid, logger); + } + } + else { + // otherwise, it's a noise point + clusterids.putInt(id, noiseid); + noisesize += 1; + } + // We've completed this element + if(progress != null) { + progress.incrementProcessed(logger); + } + } + // Finish progress logging. + if(progress != null) { + progress.ensureCompleted(logger); + } + if(clusprogress != null) { + clusprogress.setCompleted(logger); + } + + // Transform cluster ID mapping into a clustering result: + ArrayList clusterlists = new ArrayList(clusterid + 1); + // add noise cluster storage + clusterlists.add(DBIDUtil.newArray(noisesize)); + // add storage containers for clusters + for(int i = 0; i < clustersizes.size(); i++) { + clusterlists.add(DBIDUtil.newArray(clustersizes.get(i))); + } + // do the actual inversion + for(DBIDIter id = ids.iter(); id.valid(); id.advance()) { + int cluster = clusterids.intValue(id); + clusterlists.get(cluster + 1).add(id); + } + clusterids.destroy(); + + Clustering result = new Clustering("GDBSCAN", "gdbscan-clustering"); + int cid = 0; + for(ArrayModifiableDBIDs res : clusterlists) { + boolean isNoise = (cid == 0); + Cluster c = new Cluster(res, isNoise, ClusterModel.CLUSTER); + result.addCluster(c); + cid++; + } + return result; + } + + /** + * Set-based expand cluster implementation. + * + * @param clusterid ID of the current cluster. + * @param clusterids Current object to cluster mapping. + * @param neighbors Neighbors acquired by initial getNeighbors call. 
+ * @param progress Progress logging + * + * @return number of objects taken from the active set (cluster size estimate) + */ + protected int setbasedExpandCluster(final int clusterid, final WritableIntegerDataStore clusterids, final T neighbors, final FiniteProgress progress) { + int clustersize = 0; + final ArrayModifiableDBIDs activeSet = DBIDUtil.newArray(); + npred.addDBIDs(activeSet, neighbors); + // run expandCluster as long as this set is non-empty (non-recursive + // implementation) + while(!activeSet.isEmpty()) { + final DBID id = activeSet.remove(activeSet.size() - 1); + clustersize += 1; + // Assign object to cluster + final int oldclus = clusterids.putInt(id, clusterid); + if(oldclus == -2) { + // expandCluster again: + // Evaluate Neighborhood predicate + final T newneighbors = npred.getNeighbors(id); + // Evaluate Core-Point predicate + if(corepred.isCorePoint(id, newneighbors)) { + // Note: the recursion is unrolled into iteration over the active + // set. + npred.addDBIDs(activeSet, newneighbors); + } + if(progress != null) { + progress.incrementProcessed(logger); + } + } + } + return clustersize; + } + } + + /** + * Parameterization class + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer extends AbstractParameterizer { + /** + * Neighborhood predicate + */ + NeighborPredicate npred = null; + + /** + * Core point predicate + */ + CorePredicate corepred = null; + + /** + * Parameter for neighborhood predicate + */ + public final static OptionID NEIGHBORHOODPRED_ID = OptionID.getOrCreateOptionID("gdbscan.neighborhood", "Neighborhood predicate for GDBSCAN"); + + /** + * Parameter for core predicate + */ + public final static OptionID COREPRED_ID = OptionID.getOrCreateOptionID("gdbscan.core", "Core point predicate for GDBSCAN"); + + @Override + protected void makeOptions(Parameterization config) { + // Neighborhood predicate + ObjectParameter npredOpt = new ObjectParameter(NEIGHBORHOODPRED_ID, NeighborPredicate.class, EpsilonNeighborPredicate.class); +
if(config.grab(npredOpt)) { + npred = npredOpt.instantiateClass(config); + } + + // Core point predicate + ObjectParameter corepredOpt = new ObjectParameter(COREPRED_ID, CorePredicate.class, MinPtsCorePredicate.class); + if(config.grab(corepredOpt)) { + corepred = corepredOpt.instantiateClass(config); + } + } + + @Override + protected GeneralizedDBSCAN makeInstance() { + return new GeneralizedDBSCAN(npred, corepred); + } + } +} \ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/MinPtsCorePredicate.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/MinPtsCorePredicate.java new file mode 100644 index 00000000..b9852eca --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/MinPtsCorePredicate.java @@ -0,0 +1,178 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.gdbscan; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2012 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see . 
+ */ + +import java.util.List; + +import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation; +import de.lmu.ifi.dbs.elki.data.type.TypeUtil; +import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.ids.DBIDRef; +import de.lmu.ifi.dbs.elki.database.ids.DBIDs; +import de.lmu.ifi.dbs.elki.database.query.DistanceResultPair; +import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; +import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; + +/** + * The DBSCAN default core point predicate -- having at least {@link #minpts} + * neighbors. + * + *

+ * Reference:
+ * M. Ester, H.-P. Kriegel, J. Sander, and X. Xu: A Density-Based Algorithm for + * Discovering Clusters in Large Spatial Databases with Noise.
+ * In Proc. 2nd Int. Conf. on Knowledge Discovery and Data Mining (KDD '96), + * Portland, OR, 1996. + *

+ * + * @author Erich Schubert + * + * @apiviz.has Instance + */ +@Reference(authors = "M. Ester, H.-P. Kriegel, J. Sander, and X. Xu", title = "A Density-Based Algorithm for Discovering Clusters in Large Spatial Databases with Noise", booktitle = "Proc. 2nd Int. Conf. on Knowledge Discovery and Data Mining (KDD '96), Portland, OR, 1996", url = "http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.71.1980") +public class MinPtsCorePredicate implements CorePredicate { + /** + * The minpts parameter. + */ + int minpts; + + /** + * Default constructor. + * + * @param minpts Minimum number of neighbors to be a core point. + */ + public MinPtsCorePredicate(int minpts) { + super(); + this.minpts = minpts; + } + + @SuppressWarnings("unchecked") + @Override + public Instance instantiate(Database database, SimpleTypeInformation type) { + if(TypeUtil.DBIDS.isAssignableFromType(type)) { + return (Instance) new DBIDsInstance(minpts); + } + if(TypeUtil.NEIGHBORLIST.isAssignableFromType(type)) { + return (Instance) new NeighborListInstance(minpts); + } + throw new AbortException("Incompatible predicate types"); + } + + @Override + public boolean acceptsType(SimpleTypeInformation type) { + if(TypeUtil.DBIDS.isAssignableFromType(type)) { + return true; + } + if(TypeUtil.NEIGHBORLIST.isAssignableFromType(type)) { + return true; + } + return false; + } + + /** + * Instance for a particular data set. + * + * @author Erich Schubert + */ + public static class DBIDsInstance implements CorePredicate.Instance { + /** + * The minpts parameter. + */ + int minpts; + + /** + * Constructor for this predicate. + * + * @param minpts MinPts parameter + */ + public DBIDsInstance(int minpts) { + super(); + this.minpts = minpts; + } + + @Override + public boolean isCorePoint(DBIDRef point, DBIDs neighbors) { + return neighbors.size() >= minpts; + } + } + + /** + * Instance for a particular data set. 
+ * + * @author Erich Schubert + */ + public static class NeighborListInstance implements CorePredicate.Instance>> { + /** + * The minpts parameter. + */ + int minpts; + + /** + * Constructor for this predicate. + * + * @param minpts MinPts parameter + */ + public NeighborListInstance(int minpts) { + super(); + this.minpts = minpts; + } + + @Override + public boolean isCorePoint(DBIDRef point, List> neighbors) { + return neighbors.size() >= minpts; + } + } + + /** + * Parameterization class + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer extends AbstractParameterizer { + /** + * Minpts value + */ + int minpts; + + @Override + protected void makeOptions(Parameterization config) { + super.makeOptions(config); + // Get the minpts parameter + IntParameter minptsP = new IntParameter(de.lmu.ifi.dbs.elki.algorithm.clustering.DBSCAN.MINPTS_ID); + if(config.grab(minptsP)) { + minpts = minptsP.getValue(); + } + } + + @Override + protected MinPtsCorePredicate makeInstance() { + return new MinPtsCorePredicate(minpts); + } + } +} \ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/NeighborPredicate.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/NeighborPredicate.java new file mode 100644 index 00000000..4f9eca27 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/NeighborPredicate.java @@ -0,0 +1,94 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.gdbscan; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2012 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later 
version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see . + */ + +import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation; +import de.lmu.ifi.dbs.elki.data.type.TypeInformation; +import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.ids.DBIDRef; +import de.lmu.ifi.dbs.elki.database.ids.DBIDs; +import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; + +/** + * Get the neighbors of an object + * + * Note the Factory/Instance split of this interface. + * + * @author Erich Schubert + * + * @apiviz.has Instance + */ +public interface NeighborPredicate { + /** + * Instantiate for a database. + * + * @param database Database to instantiate for + * @return Instance + */ + public Instance instantiate(Database database, SimpleTypeInformation type); + + /** + * Input data type restriction. + * + * @return Type restriction + */ + public TypeInformation getInputTypeRestriction(); + + /** + * Output data type information. + * + * @return Type information + */ + public SimpleTypeInformation[] getOutputType(); + + /** + * Instance for a particular data set. + * + * @author Erich Schubert + */ + public static interface Instance { + /** + * Get the neighbors of a reference object for DBSCAN. + * + * @param reference Reference object + * @return Neighborhood + */ + public T getNeighbors(DBIDRef reference); + + /** + * Get the IDs the predicate is defined for. 
+ * + * @return Database ids + */ + public DBIDs getIDs(); + + /** + * Add the neighbors to a DBID set + * + * @param ids ID set + * @param neighbors Neighbors to add + */ + public void addDBIDs(ModifiableDBIDs ids, T neighbors); + } +} diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/package-info.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/package-info.java new file mode 100644 index 00000000..8be23c7d --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/package-info.java @@ -0,0 +1,43 @@ +/** + *

Generalized DBSCAN.

+ * + * Generalized DBSCAN is an abstraction of the original DBSCAN idea, + * that allows the use of arbitrary "neighborhood" and "core point" predicates. + * + * For each object, the neighborhood as defined by the "neighborhood" predicate + * is retrieved - in original DBSCAN, this is the objects within an epsilon sphere + * around the query object. Then the core point predicate is evaluated to decide if + * the object is considered dense. If so, a cluster is started (or extended) to + * include the neighbors as well. + * + *

+ * Reference:
+ * Jörg Sander, Martin Ester, Hans-Peter Kriegel, Xiaowei Xu:
+ * Density-Based Clustering in Spatial Databases: The Algorithm GDBSCAN and Its + * Applications
+ * In: Data Mining and Knowledge Discovery, 1998. + *

+ */ +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2012 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see . + */ +package de.lmu.ifi.dbs.elki.algorithm.clustering.gdbscan; \ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeans.java index d3c73b53..92862909 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeans.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeans.java @@ -1,24 +1,5 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; -import java.util.ArrayList; -import java.util.Iterator; -import java.util.List; - -import de.lmu.ifi.dbs.elki.algorithm.AbstractPrimitiveDistanceBasedAlgorithm; -import de.lmu.ifi.dbs.elki.data.Clustering; -import de.lmu.ifi.dbs.elki.data.NumberVector; -import de.lmu.ifi.dbs.elki.data.model.MeanModel; -import de.lmu.ifi.dbs.elki.data.type.TypeInformation; -import de.lmu.ifi.dbs.elki.data.type.TypeUtil; -import de.lmu.ifi.dbs.elki.database.ids.DBID; -import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; -import de.lmu.ifi.dbs.elki.database.relation.Relation; -import 
de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; -import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDoubleDistanceFunction; -import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance; -import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; - /* This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures @@ -42,37 +23,39 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; along with this program. If not, see . */ +import java.util.ArrayList; +import java.util.List; + +import de.lmu.ifi.dbs.elki.algorithm.AbstractPrimitiveDistanceBasedAlgorithm; +import de.lmu.ifi.dbs.elki.data.Clustering; +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.data.VectorUtil.SortDBIDsBySingleDimension; +import de.lmu.ifi.dbs.elki.data.model.MeanModel; +import de.lmu.ifi.dbs.elki.data.type.TypeInformation; +import de.lmu.ifi.dbs.elki.data.type.TypeUtil; +import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs; +import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; +import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDoubleDistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance; +import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; +import de.lmu.ifi.dbs.elki.utilities.datastructures.QuickSelect; + /** * Abstract base class for k-means implementations. 
* * @author Erich Schubert * + * @apiviz.composedOf KMeansInitialization + * * @param Vector type * @param Distance type */ -public abstract class AbstractKMeans, D extends Distance> extends AbstractPrimitiveDistanceBasedAlgorithm, D, Clustering>> { - /** - * Parameter to specify the number of clusters to find, must be an integer - * greater than 0. - */ - public static final OptionID K_ID = OptionID.getOrCreateOptionID("kmeans.k", "The number of clusters to find."); - - /** - * Parameter to specify the number of clusters to find, must be an integer - * greater or equal to 0, where 0 means no limit. - */ - public static final OptionID MAXITER_ID = OptionID.getOrCreateOptionID("kmeans.maxiter", "The maximum number of iterations to do. 0 means no limit."); - - /** - * Parameter to specify the random generator seed. - */ - public static final OptionID SEED_ID = OptionID.getOrCreateOptionID("kmeans.seed", "The random number generator seed."); - - /** - * Parameter to specify the initialization method - */ - public static final OptionID INIT_ID = OptionID.getOrCreateOptionID("kmeans.initialization", "Method to choose the initial means."); - +public abstract class AbstractKMeans, D extends Distance> extends AbstractPrimitiveDistanceBasedAlgorithm, D, Clustering>> implements KMeans { /** * Holds the value of {@link #K_ID}. 
*/ @@ -94,6 +77,7 @@ public abstract class AbstractKMeans, D extends Dis * @param distanceFunction distance function * @param k k parameter * @param maxiter Maxiter parameter + * @param initializer Function to generate the initial means */ public AbstractKMeans(PrimitiveDistanceFunction, D> distanceFunction, int k, int maxiter, KMeansInitialization initializer) { super(distanceFunction); @@ -111,15 +95,15 @@ public abstract class AbstractKMeans, D extends Dis * @param clusters cluster assignment * @return true when the object was reassigned */ - protected boolean assignToNearestCluster(Relation relation, List means, List clusters) { + protected boolean assignToNearestCluster(Relation relation, List> means, List clusters) { boolean changed = false; if(getDistanceFunction() instanceof PrimitiveDoubleDistanceFunction) { @SuppressWarnings("unchecked") final PrimitiveDoubleDistanceFunction> df = (PrimitiveDoubleDistanceFunction>) getDistanceFunction(); - for(DBID id : relation.iterDBIDs()) { + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { double mindist = Double.POSITIVE_INFINITY; - V fv = relation.get(id); + V fv = relation.get(iditer); int minIndex = 0; for(int i = 0; i < k; i++) { double dist = df.doubleDistance(fv, means.get(i)); @@ -128,13 +112,13 @@ public abstract class AbstractKMeans, D extends Dis mindist = dist; } } - if(clusters.get(minIndex).add(id)) { + if(clusters.get(minIndex).add(iditer)) { changed = true; // Remove from previous cluster // TODO: keep a list of cluster assignments to save this search? 
for(int i = 0; i < k; i++) { if(i != minIndex) { - if(clusters.get(i).remove(id)) { + if(clusters.get(i).remove(iditer)) { break; } } @@ -144,9 +128,9 @@ public abstract class AbstractKMeans, D extends Dis } else { final PrimitiveDistanceFunction, D> df = getDistanceFunction(); - for(DBID id : relation.iterDBIDs()) { + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { D mindist = df.getDistanceFactory().infiniteDistance(); - V fv = relation.get(id); + V fv = relation.get(iditer); int minIndex = 0; for(int i = 0; i < k; i++) { D dist = df.distance(fv, means.get(i)); @@ -155,13 +139,13 @@ public abstract class AbstractKMeans, D extends Dis mindist = dist; } } - if(clusters.get(minIndex).add(id)) { + if(clusters.get(minIndex).add(iditer)) { changed = true; // Remove from previous cluster // TODO: keep a list of cluster assignments to save this search? for(int i = 0; i < k; i++) { if(i != minIndex) { - if(clusters.get(i).remove(id)) { + if(clusters.get(i).remove(iditer)) { break; } } @@ -185,31 +169,59 @@ public abstract class AbstractKMeans, D extends Dis * @param database the database containing the vectors * @return the mean vectors of the given clusters in the given database */ - protected List means(List clusters, List means, Relation database) { + protected List means(List clusters, List> means, Relation database) { List newMeans = new ArrayList(k); for(int i = 0; i < k; i++) { ModifiableDBIDs list = clusters.get(i); Vector mean = null; - for(Iterator clusterIter = list.iterator(); clusterIter.hasNext();) { - if(mean == null) { - mean = database.get(clusterIter.next()).getColumnVector(); - } - else { - mean.plusEquals(database.get(clusterIter.next()).getColumnVector()); - } - } if(list.size() > 0) { - assert mean != null; - mean.timesEquals(1.0 / list.size()); + double s = 1.0 / list.size(); + DBIDIter iter = list.iter(); + assert (iter.valid()); + mean = database.get(iter).getColumnVector().timesEquals(s); + iter.advance(); + for(; 
iter.valid(); iter.advance()) { + mean.plusTimesEquals(database.get(iter).getColumnVector(), s); + } } else { - mean = means.get(i); + mean = means.get(i).getColumnVector(); } newMeans.add(mean); } return newMeans; } + /** + * Returns the median vectors of the given clusters in the given database. + * + * @param clusters the clusters to compute the medians for + * @param medians the recent medians (kept for clusters that are empty) + * @param database the database containing the vectors + * @return the median vectors of the given clusters in the given database + */ + protected List> medians(List clusters, List> medians, Relation database) { + final int dim = medians.get(0).getDimensionality(); + final SortDBIDsBySingleDimension sorter = new SortDBIDsBySingleDimension(database); + List> newMedians = new ArrayList>(k); + for(int i = 0; i < k; i++) { + ArrayModifiableDBIDs list = DBIDUtil.newArray(clusters.get(i)); + if(list.size() > 0) { + Vector mean = new Vector(dim); + for(int d = 0; d < dim; d++) { + sorter.setDimension(d + 1); + DBID id = QuickSelect.median(list, sorter); + mean.set(d, database.get(id).doubleValue(d + 1)); + } + newMedians.add(mean); + } + else { + newMedians.add((NumberVector) medians.get(i)); + } + } + return newMedians; + } + /** * Compute an incremental update for the mean * @@ -239,16 +251,16 @@ public abstract class AbstractKMeans, D extends Dis */ protected boolean macQueenIterate(Relation relation, List means, List clusters) { boolean changed = false; - + if(getDistanceFunction() instanceof PrimitiveDoubleDistanceFunction) { // Raw distance function @SuppressWarnings("unchecked") final PrimitiveDoubleDistanceFunction> df = (PrimitiveDoubleDistanceFunction>) getDistanceFunction(); - + // Incremental update - for(DBID id : relation.iterDBIDs()) { + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { double mindist = Double.POSITIVE_INFINITY; - V fv = relation.get(id); + V fv = relation.get(iditer); int minIndex = 0; for(int i = 0; i < k; i++) { double
dist = df.doubleDistance(fv, means.get(i)); @@ -261,13 +273,13 @@ public abstract class AbstractKMeans, D extends Dis for(int i = 0; i < k; i++) { ModifiableDBIDs ci = clusters.get(i); if(i == minIndex) { - if(ci.add(id)) { - incrementalUpdateMean(means.get(i), relation.get(id), ci.size(), +1); + if(ci.add(iditer)) { + incrementalUpdateMean(means.get(i), fv, ci.size(), +1); changed = true; } } - else if(ci.remove(id)) { - incrementalUpdateMean(means.get(i), relation.get(id), ci.size() + 1, -1); + else if(ci.remove(iditer)) { + incrementalUpdateMean(means.get(i), fv, ci.size() + 1, -1); changed = true; } } @@ -276,11 +288,11 @@ public abstract class AbstractKMeans, D extends Dis else { // Raw distance function final PrimitiveDistanceFunction, D> df = getDistanceFunction(); - + // Incremental update - for(DBID id : relation.iterDBIDs()) { + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { D mindist = df.getDistanceFactory().infiniteDistance(); - V fv = relation.get(id); + V fv = relation.get(iditer); int minIndex = 0; for(int i = 0; i < k; i++) { D dist = df.distance(fv, means.get(i)); @@ -293,13 +305,13 @@ public abstract class AbstractKMeans, D extends Dis for(int i = 0; i < k; i++) { ModifiableDBIDs ci = clusters.get(i); if(i == minIndex) { - if(ci.add(id)) { - incrementalUpdateMean(means.get(i), relation.get(id), ci.size(), +1); + if(ci.add(iditer)) { + incrementalUpdateMean(means.get(i), fv, ci.size(), +1); changed = true; } } - else if(ci.remove(id)) { - incrementalUpdateMean(means.get(i), relation.get(id), ci.size() + 1, -1); + else if(ci.remove(iditer)) { + incrementalUpdateMean(means.get(i), fv, ci.size() + 1, -1); changed = true; } } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeansInitialization.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeansInitialization.java index b5f088fb..a8effecd 100644 --- 
a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeansInitialization.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeansInitialization.java @@ -22,7 +22,6 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; You should have received a copy of the GNU Affero General Public License along with this program. If not, see . */ -import de.lmu.ifi.dbs.elki.data.NumberVector; import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.LongParameter; @@ -34,9 +33,9 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.LongParameter; * * @param Vector type */ -public abstract class AbstractKMeansInitialization> implements KMeansInitialization { +public abstract class AbstractKMeansInitialization implements KMeansInitialization { /** - * Holds the value of {@link KMeansLloyd#SEED_ID}. + * Holds the value of {@link KMeans#SEED_ID}. 
*/ protected Long seed; @@ -56,13 +55,13 @@ public abstract class AbstractKMeansInitialization> * * @apiviz.exclude */ - public abstract static class Parameterizer> extends AbstractParameterizer { + public abstract static class Parameterizer extends AbstractParameterizer { protected Long seed; @Override protected void makeOptions(Parameterization config) { super.makeOptions(config); - LongParameter seedP = new LongParameter(AbstractKMeans.SEED_ID, true); + LongParameter seedP = new LongParameter(KMeans.SEED_ID, true); if(config.grab(seedP)) { seed = seedP.getValue(); } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/FirstKInitialMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/FirstKInitialMeans.java index 78ccd426..7a7f2867 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/FirstKInitialMeans.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/FirstKInitialMeans.java @@ -23,14 +23,16 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; along with this program. If not, see . 
*/ import java.util.ArrayList; -import java.util.Iterator; import java.util.List; import de.lmu.ifi.dbs.elki.data.NumberVector; -import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; +import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.ids.DBIDs; +import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; -import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; /** @@ -40,20 +42,30 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; * * @param Vector type */ -public class FirstKInitialMeans> extends AbstractKMeansInitialization { +public class FirstKInitialMeans implements KMeansInitialization, KMedoidsInitialization { /** * Constructor. 
*/ public FirstKInitialMeans() { - super(null); + super(); } @Override - public List chooseInitialMeans(Relation relation, int k, PrimitiveDistanceFunction distanceFunction) { - Iterator iter = relation.iterDBIDs(); - List means = new ArrayList(k); - for(int i = 0; i < k && iter.hasNext(); i++) { - means.add(relation.get(iter.next()).getColumnVector()); + public List chooseInitialMeans(Relation relation, int k, PrimitiveDistanceFunction distanceFunction) { + DBIDIter iter = relation.iterDBIDs(); + List means = new ArrayList(k); + for(int i = 0; i < k && iter.valid(); i++, iter.advance()) { + means.add(relation.get(iter)); + } + return means; + } + + @Override + public DBIDs chooseInitialMedoids(int k, DistanceQuery distanceFunction) { + DBIDIter iter = distanceFunction.getRelation().iterDBIDs(); + ArrayModifiableDBIDs means = DBIDUtil.newArray(k); + for(int i = 0; i < k && iter.valid(); i++, iter.advance()) { + means.add(iter); } return means; } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeans.java new file mode 100644 index 00000000..37171d4a --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeans.java @@ -0,0 +1,55 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; + +import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2012 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see . + */ + +/** + * Some constants and options shared among kmeans family algorithms. + * + * @author Erich Schubert + */ +public interface KMeans { + /** + * Parameter to specify the initialization method + */ + public static final OptionID INIT_ID = OptionID.getOrCreateOptionID("kmeans.initialization", "Method to choose the initial means."); + + /** + * Parameter to specify the number of clusters to find, must be an integer + * greater than 0. + */ + public static final OptionID K_ID = OptionID.getOrCreateOptionID("kmeans.k", "The number of clusters to find."); + + /** + * Parameter to specify the maximum number of iterations, must be an integer + * greater or equal to 0, where 0 means no limit. + */ + public static final OptionID MAXITER_ID = OptionID.getOrCreateOptionID("kmeans.maxiter", "The maximum number of iterations to do. 0 means no limit."); + + /** + * Parameter to specify the random generator seed. 
+ */ + public static final OptionID SEED_ID = OptionID.getOrCreateOptionID("kmeans.seed", "The random number generator seed."); +} \ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansInitialization.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansInitialization.java index f4c0d9c7..9e5d69f0 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansInitialization.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansInitialization.java @@ -24,19 +24,17 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; */ import java.util.List; -import de.lmu.ifi.dbs.elki.data.NumberVector; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; -import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; /** * Interface for initializing K-Means * * @author Erich Schubert * - * @param Vector type + * @param Object type */ -public interface KMeansInitialization> { +public interface KMeansInitialization { /** * Choose initial means * @@ -45,5 +43,5 @@ public interface KMeansInitialization> { * @param distanceFunction Distance function * @return List of chosen means for k-means */ - public abstract List chooseInitialMeans(Relation relation, int k, PrimitiveDistanceFunction distanceFunction); + public abstract List chooseInitialMeans(Relation relation, int k, PrimitiveDistanceFunction distanceFunction); } \ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansLloyd.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansLloyd.java index fda1d6c0..b1b40632 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansLloyd.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansLloyd.java @@ -39,7 +39,6 @@ import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; 
import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance; import de.lmu.ifi.dbs.elki.logging.Logging; -import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; import de.lmu.ifi.dbs.elki.utilities.DatabaseUtil; import de.lmu.ifi.dbs.elki.utilities.documentation.Description; import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; @@ -94,14 +93,13 @@ public class KMeansLloyd, D extends Distance> ex * @param database Database * @param relation relation to use * @return result - * @throws IllegalStateException */ - public Clustering> run(Database database, Relation relation) throws IllegalStateException { + public Clustering> run(Database database, Relation relation) { if(relation.size() <= 0) { return new Clustering>("k-Means Clustering", "kmeans-clustering"); } // Choose initial means - List means = initializer.chooseInitialMeans(relation, k, getDistanceFunction()); + List> means = initializer.chooseInitialMeans(relation, k, getDistanceFunction()); // Setup cluster assignment store List clusters = new ArrayList(); for(int i = 0; i < k; i++) { @@ -124,7 +122,7 @@ public class KMeansLloyd, D extends Distance> ex final V factory = DatabaseUtil.assumeVectorField(relation).getFactory(); Clustering> result = new Clustering>("k-Means Clustering", "kmeans-clustering"); for(int i = 0; i < clusters.size(); i++) { - MeanModel model = new MeanModel(factory.newNumberVector(means.get(i).getArrayRef())); + MeanModel model = new MeanModel(factory.newNumberVector(means.get(i).getColumnVector().getArrayRef())); result.addCluster(new Cluster>(clusters.get(i), model)); } return result; diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansMacQueen.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansMacQueen.java index 56492dd0..c729eb10 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansMacQueen.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansMacQueen.java @@ -93,15 +93,17 @@ public class KMeansMacQueen, D 
extends Distance> * * @param database Database * @param relation relation to use - * @return result - * @throws IllegalStateException + * @return Clustering result */ - public Clustering> run(Database database, Relation relation) throws IllegalStateException { + public Clustering> run(Database database, Relation relation) { if(relation.size() <= 0) { return new Clustering>("k-Means Clustering", "kmeans-clustering"); } // Choose initial means - List means = initializer.chooseInitialMeans(relation, k, getDistanceFunction()); + List means = new ArrayList(k); + for(NumberVector nv : initializer.chooseInitialMeans(relation, k, getDistanceFunction())) { + means.add(nv.getColumnVector()); + } // Initialize cluster and assign objects List clusters = new ArrayList(); for(int i = 0; i < k; i++) { diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansPlusPlusInitialMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansPlusPlusInitialMeans.java index c7a2fa1d..9afeff6c 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansPlusPlusInitialMeans.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansPlusPlusInitialMeans.java @@ -26,19 +26,18 @@ import java.util.ArrayList; import java.util.List; import java.util.Random; -import de.lmu.ifi.dbs.elki.data.NumberVector; import de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs; +import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.ids.DBID; import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; -import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; +import de.lmu.ifi.dbs.elki.database.ids.DBIDs; import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDoubleDistanceFunction; import 
de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance; import de.lmu.ifi.dbs.elki.logging.LoggingUtil; -import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException; @@ -59,7 +58,7 @@ import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException; * @param Distance type */ @Reference(authors = "D. Arthur, S. Vassilvitskii", title = "k-means++: the advantages of careful seeding", booktitle = "Proc. of the Eighteenth Annual ACM-SIAM Symposium on Discrete Algorithms, SODA 2007", url = "http://dx.doi.org/10.1145/1283383.1283494") -public class KMeansPlusPlusInitialMeans, D extends NumberDistance> extends AbstractKMeansInitialization { +public class KMeansPlusPlusInitialMeans> extends AbstractKMeansInitialization implements KMedoidsInitialization { /** * Constructor. * @@ -70,7 +69,7 @@ public class KMeansPlusPlusInitialMeans, D extends } @Override - public List chooseInitialMeans(Relation relation, int k, PrimitiveDistanceFunction distanceFunction) { + public List chooseInitialMeans(Relation relation, int k, PrimitiveDistanceFunction distanceFunction) { // Get a distance query if(!(distanceFunction.getDistanceFactory() instanceof NumberDistance)) { throw new AbortException("K-Means++ initialization can only be used with numerical distances."); @@ -80,14 +79,12 @@ public class KMeansPlusPlusInitialMeans, D extends DistanceQuery distQ = relation.getDatabase().getDistanceQuery(relation, distF); // Chose first mean - List means = new ArrayList(k); + List means = new ArrayList(k); Random random = (seed != null) ? 
new Random(seed) : new Random(); - DBID first = DBIDUtil.randomSample(relation.getDBIDs(), 1, random.nextLong()).iterator().next(); - means.add(relation.get(first).getColumnVector()); + DBID first = DBIDUtil.randomSample(relation.getDBIDs(), 1, random.nextLong()).iter().getDBID(); + means.add(relation.get(first)); - ModifiableDBIDs chosen = DBIDUtil.newHashSet(k); - chosen.add(first); ArrayDBIDs ids = DBIDUtil.ensureArray(relation.getDBIDs()); // Initialize weights double[] weights = new double[ids.size()]; @@ -107,16 +104,16 @@ public class KMeansPlusPlusInitialMeans, D extends } // Add new mean: DBID newmean = ids.get(pos); - means.add(relation.get(newmean).getColumnVector()); - chosen.add(newmean); + means.add(relation.get(newmean)); // Update weights: weights[pos] = 0.0; // Choose optimized version for double distances, if applicable. - if (distF instanceof PrimitiveDoubleDistanceFunction) { + if(distF instanceof PrimitiveDoubleDistanceFunction) { @SuppressWarnings("unchecked") PrimitiveDoubleDistanceFunction ddist = (PrimitiveDoubleDistanceFunction) distF; weightsum = updateWeights(weights, ids, newmean, ddist, relation); - } else { + } + else { weightsum = updateWeights(weights, ids, newmean, distQ); } } @@ -124,6 +121,48 @@ public class KMeansPlusPlusInitialMeans, D extends return means; } + @Override + public DBIDs chooseInitialMedoids(int k, DistanceQuery distQ2) { + if(!(distQ2.getDistanceFactory() instanceof NumberDistance)) { + throw new AbortException("PAM initialization can only be used with numerical distances."); + } + @SuppressWarnings("unchecked") + DistanceQuery distQ = (DistanceQuery) distQ2; + // Chose first mean + ArrayModifiableDBIDs means = DBIDUtil.newArray(k); + + Random random = (seed != null) ? 
new Random(seed) : new Random(); + DBID first = DBIDUtil.randomSample(distQ.getRelation().getDBIDs(), 1, random.nextLong()).iter().getDBID(); + means.add(first); + + ArrayDBIDs ids = DBIDUtil.ensureArray(distQ.getRelation().getDBIDs()); + // Initialize weights + double[] weights = new double[ids.size()]; + double weightsum = initialWeights(weights, ids, first, distQ); + while(means.size() < k) { + if(weightsum > Double.MAX_VALUE) { + LoggingUtil.warning("Could not choose a reasonable mean for k-means++ - too many data points, too large squared distances?"); + } + if(weightsum < Double.MIN_NORMAL) { + LoggingUtil.warning("Could not choose a reasonable mean for k-means++ - to few data points?"); + } + double r = random.nextDouble() * weightsum; + int pos = 0; + while(r > 0 && pos < weights.length) { + r -= weights[pos]; + pos++; + } + // Add new mean: + DBID newmean = ids.get(pos); + means.add(newmean); + // Update weights: + weights[pos] = 0.0; + weightsum = updateWeights(weights, ids, newmean, distQ); + } + + return means; + } + /** * Initialize the weight list. 
* @@ -133,16 +172,15 @@ public class KMeansPlusPlusInitialMeans, D extends * @param distQ Distance query * @return Weight sum */ - protected double initialWeights(double[] weights, ArrayDBIDs ids, DBID latest, DistanceQuery distQ) { + protected double initialWeights(double[] weights, ArrayDBIDs ids, DBID latest, DistanceQuery distQ) { double weightsum = 0.0; DBIDIter it = ids.iter(); for(int i = 0; i < weights.length; i++, it.advance()) { - DBID id = it.getDBID(); - if(latest.equals(id)) { + if(latest.sameDBID(it)) { weights[i] = 0.0; } else { - double d = distQ.distance(latest, id).doubleValue(); + double d = distQ.distance(latest, it).doubleValue(); weights[i] = d * d; } weightsum += weights[i]; @@ -159,13 +197,12 @@ public class KMeansPlusPlusInitialMeans, D extends * @param distQ Distance query * @return Weight sum */ - protected double updateWeights(double[] weights, ArrayDBIDs ids, DBID latest, DistanceQuery distQ) { + protected double updateWeights(double[] weights, ArrayDBIDs ids, DBID latest, DistanceQuery distQ) { double weightsum = 0.0; DBIDIter it = ids.iter(); for(int i = 0; i < weights.length; i++, it.advance()) { - DBID id = it.getDBID(); if(weights[i] > 0.0) { - double d = distQ.distance(latest, id).doubleValue(); + double d = distQ.distance(latest, it).doubleValue(); weights[i] = Math.min(weights[i], d * d); weightsum += weights[i]; } @@ -187,9 +224,8 @@ public class KMeansPlusPlusInitialMeans, D extends double weightsum = 0.0; DBIDIter it = ids.iter(); for(int i = 0; i < weights.length; i++, it.advance()) { - DBID id = it.getDBID(); if(weights[i] > 0.0) { - double d = distF.doubleDistance(lv, rel.get(id)); + double d = distF.doubleDistance(lv, rel.get(it)); weights[i] = Math.min(weights[i], d * d); weightsum += weights[i]; } @@ -204,7 +240,7 @@ public class KMeansPlusPlusInitialMeans, D extends * * @apiviz.exclude */ - public static class Parameterizer, D extends NumberDistance> extends AbstractKMeansInitialization.Parameterizer { + public static 
class Parameterizer> extends AbstractKMeansInitialization.Parameterizer { @Override protected KMeansPlusPlusInitialMeans makeInstance() { return new KMeansPlusPlusInitialMeans(seed); diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMediansLloyd.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMediansLloyd.java new file mode 100644 index 00000000..8c284981 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMediansLloyd.java @@ -0,0 +1,172 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2012 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see . 
+ */ + +import java.util.ArrayList; +import java.util.List; + +import de.lmu.ifi.dbs.elki.algorithm.AbstractPrimitiveDistanceBasedAlgorithm; +import de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm; +import de.lmu.ifi.dbs.elki.data.Cluster; +import de.lmu.ifi.dbs.elki.data.Clustering; +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.data.model.MeanModel; +import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance; +import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.utilities.DatabaseUtil; +import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; +import de.lmu.ifi.dbs.elki.utilities.documentation.Title; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; + +/** + * Provides the k-medians clustering algorithm, using Lloyd-style bulk + * iterations. + * + * Reference: + *

+ * Clustering via Concave Minimization
+ * P. S. Bradley, O. L. Mangasarian, W. N. Street
+ * in: Advances in neural information processing systems + *

+ * + * @author Erich Schubert + * + * @apiviz.has MeanModel + * + * @param vector datatype + * @param distance value type + */ +@Title("K-Medians") +@Reference(title = "Clustering via Concave Minimization", authors = "P. S. Bradley, O. L. Mangasarian, W. N. Street", booktitle = "Advances in neural information processing systems", url="http://nips.djvuzone.org/djvu/nips09/0368.djvu") +public class KMediansLloyd, D extends Distance> extends AbstractKMeans implements ClusteringAlgorithm>> { + /** + * The logger for this class. + */ + private static final Logging logger = Logging.getLogger(KMediansLloyd.class); + + /** + * Constructor. + * + * @param distanceFunction distance function + * @param k k parameter + * @param maxiter Maxiter parameter + */ + public KMediansLloyd(PrimitiveDistanceFunction, D> distanceFunction, int k, int maxiter, KMeansInitialization initializer) { + super(distanceFunction, k, maxiter, initializer); + } + + /** + * Run k-medians + * + * @param database Database + * @param relation relation to use + * @return result + */ + public Clustering> run(Database database, Relation relation) { + if(relation.size() <= 0) { + return new Clustering>("k-Medians Clustering", "kmedians-clustering"); + } + // Choose initial medians + List> medians = initializer.chooseInitialMeans(relation, k, getDistanceFunction()); + // Setup cluster assignment store + List clusters = new ArrayList(); + for(int i = 0; i < k; i++) { + clusters.add(DBIDUtil.newHashSet(relation.size() / k)); + } + + for(int iteration = 0; maxiter <= 0 || iteration < maxiter; iteration++) { + if(logger.isVerbose()) { + logger.verbose("K-Medians iteration " + (iteration + 1)); + } + boolean changed = assignToNearestCluster(relation, medians, clusters); + // Stop if no cluster assignment changed. + if(!changed) { + break; + } + // Recompute medians. 
+ medians = medians(clusters, medians, relation); + } + // Wrap result + final V factory = DatabaseUtil.assumeVectorField(relation).getFactory(); + Clustering> result = new Clustering>("k-Medians Clustering", "kmedians-clustering"); + for(int i = 0; i < clusters.size(); i++) { + MeanModel model = new MeanModel(factory.newNumberVector(medians.get(i).getColumnVector().getArrayRef())); + result.addCluster(new Cluster>(clusters.get(i), model)); + } + return result; + } + + @Override + protected Logging getLogger() { + return logger; + } + + /** + * Parameterization class. + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer, D extends Distance> extends AbstractPrimitiveDistanceBasedAlgorithm.Parameterizer, D> { + protected int k; + + protected int maxiter; + + protected KMeansInitialization initializer; + + @Override + protected void makeOptions(Parameterization config) { + super.makeOptions(config); + IntParameter kP = new IntParameter(K_ID, new GreaterConstraint(0)); + if(config.grab(kP)) { + k = kP.getValue(); + } + + ObjectParameter> initialP = new ObjectParameter>(INIT_ID, KMeansInitialization.class, RandomlyGeneratedInitialMeans.class); + if(config.grab(initialP)) { + initializer = initialP.instantiateClass(config); + } + + IntParameter maxiterP = new IntParameter(MAXITER_ID, new GreaterEqualConstraint(0), 0); + if(config.grab(maxiterP)) { + maxiter = maxiterP.getValue(); + } + } + + @Override + protected AbstractKMeans makeInstance() { + return new KMediansLloyd(distanceFunction, k, maxiter, initializer); + } + } +} \ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsEM.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsEM.java new file mode 100644 index 00000000..a5c3d675 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsEM.java @@ -0,0 +1,271 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; + +/* + This file is 
part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2012 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see . + */ + +import java.util.ArrayList; +import java.util.List; + +import de.lmu.ifi.dbs.elki.algorithm.AbstractDistanceBasedAlgorithm; +import de.lmu.ifi.dbs.elki.algorithm.AbstractPrimitiveDistanceBasedAlgorithm; +import de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm; +import de.lmu.ifi.dbs.elki.data.Cluster; +import de.lmu.ifi.dbs.elki.data.Clustering; +import de.lmu.ifi.dbs.elki.data.model.MedoidModel; +import de.lmu.ifi.dbs.elki.data.type.TypeInformation; +import de.lmu.ifi.dbs.elki.data.type.TypeUtil; +import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs; +import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs; +import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; +import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; +import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; +import 
de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance; +import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.math.Mean; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; + +/** + * Provides the k-medoids clustering algorithm, using a "bulk" variation of the + * "Partitioning Around Medoids" approach. + * + * In contrast to PAM, which will in each iteration update one medoid with one + * (arbitrary) non-medoid, this implementation follows the EM pattern. In the + * expectation step, the best medoid from the cluster members is chosen; in the + * M-step, the objects are reassigned to their nearest medoid. + * + * We do not have a reference for this algorithm. It borrows ideas from EM and + * PAM. If needed, you are welcome to cite it using the latest ELKI publication + * (this variation is likely not worth publishing on its own). + * + * @author Erich Schubert + * + * @apiviz.has MedoidModel + * @apiviz.composedOf KMedoidsInitialization + * + * @param vector datatype + * @param distance value type + */ +public class KMedoidsEM> extends AbstractDistanceBasedAlgorithm> implements ClusteringAlgorithm> { + /** + * The logger for this class. + */ + private static final Logging logger = Logging.getLogger(KMedoidsEM.class); + + /** + * Holds the value of {@link AbstractKMeans#K_ID}. + */ + protected int k; + + /** + * Holds the value of {@link AbstractKMeans#MAXITER_ID}. + */ + protected int maxiter; + + /** + * Method to choose initial medoids. + */ + protected KMedoidsInitialization initializer; + + /** + * Constructor. 
+ * + * @param distanceFunction distance function + * @param k k parameter + * @param maxiter Maxiter parameter + * @param initializer Function to generate the initial means + */ + public KMedoidsEM(PrimitiveDistanceFunction distanceFunction, int k, int maxiter, KMedoidsInitialization initializer) { + super(distanceFunction); + this.k = k; + this.maxiter = maxiter; + this.initializer = initializer; + } + + /** + * Run k-medoids + * + * @param database Database + * @param relation relation to use + * @return result + */ + public Clustering run(Database database, Relation relation) { + if(relation.size() <= 0) { + return new Clustering("k-Medoids Clustering", "kmedoids-clustering"); + } + DistanceQuery distQ = database.getDistanceQuery(relation, getDistanceFunction()); + // Choose initial medoids + ArrayModifiableDBIDs medoids = DBIDUtil.newArray(initializer.chooseInitialMedoids(k, distQ)); + // Setup cluster assignment store + List clusters = new ArrayList(); + for(int i = 0; i < k; i++) { + clusters.add(DBIDUtil.newHashSet(relation.size() / k)); + } + Mean[] mdists = Mean.newArray(k); + + // Initial assignment to nearest medoids + // TODO: reuse this information, from the build phase, when possible? 
+ assignToNearestCluster(medoids, mdists, clusters, distQ); + + // Swap phase + boolean changed = true; + while(changed) { + changed = false; + // Try to swap the medoid with a better cluster member: + for(int i = 0; i < k; i++) { + DBID med = medoids.get(i); + DBID best = null; + Mean bestm = mdists[i]; + for(DBIDIter iter = clusters.get(i).iter(); iter.valid(); iter.advance()) { + if(med.sameDBID(iter)) { + continue; + } + Mean mdist = new Mean(); + for(DBIDIter iter2 = clusters.get(i).iter(); iter2.valid(); iter2.advance()) { + mdist.put(distQ.distance(iter, iter2).doubleValue()); + } + if(mdist.getMean() < bestm.getMean()) { + best = iter.getDBID(); + bestm = mdist; + } + } + if(best != null && !med.sameDBID(best)) { + changed = true; + medoids.set(i, best); + mdists[i] = bestm; + } + } + // Reassign + if(changed) { + assignToNearestCluster(medoids, mdists, clusters, distQ); + } + } + + // Wrap result + Clustering result = new Clustering("k-Medoids Clustering", "kmedoids-clustering"); + for(int i = 0; i < clusters.size(); i++) { + MedoidModel model = new MedoidModel(medoids.get(i)); + result.addCluster(new Cluster(clusters.get(i), model)); + } + return result; + } + + /** + * Returns a list of clusters. The kth cluster contains the ids of + * those FeatureVectors, that are nearest to the kth mean. 
+ * + * @param means a list of k means + * @param mdist Mean distances + * @param clusters cluster assignment + * @param distQ distance query + * @return true when the object was reassigned + */ + protected boolean assignToNearestCluster(ArrayDBIDs means, Mean[] mdist, List clusters, DistanceQuery distQ) { + boolean changed = false; + + double[] dists = new double[k]; + for(DBIDIter iditer = distQ.getRelation().iterDBIDs(); iditer.valid(); iditer.advance()) { + int minIndex = 0; + double mindist = Double.POSITIVE_INFINITY; + for(int i = 0; i < k; i++) { + dists[i] = distQ.distance(iditer, means.get(i)).doubleValue(); + if(dists[i] < mindist) { + minIndex = i; + mindist = dists[i]; + } + } + if(clusters.get(minIndex).add(iditer)) { + changed = true; + mdist[minIndex].put(mindist); + // Remove from previous cluster + // TODO: keep a list of cluster assignments to save this search? + for(int i = 0; i < k; i++) { + if(i != minIndex) { + if(clusters.get(i).remove(iditer)) { + mdist[minIndex].put(dists[i], -1); + break; + } + } + } + } + } + return changed; + } + + @Override + public TypeInformation[] getInputTypeRestriction() { + return TypeUtil.array(getDistanceFunction().getInputTypeRestriction()); + } + + @Override + protected Logging getLogger() { + return logger; + } + + /** + * Parameterization class. 
+ * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer> extends AbstractPrimitiveDistanceBasedAlgorithm.Parameterizer { + /** Value of the KMeans.K_ID option (declared with GreaterConstraint(0)). */ protected int k; + + /** Value of the KMeans.MAXITER_ID option (declared with GreaterEqualConstraint(0)). */ protected int maxiter; + + /** Medoid initializer instantiated from the KMeans.INIT_ID option. */ protected KMedoidsInitialization initializer; + + @Override + protected void makeOptions(Parameterization config) { + super.makeOptions(config); + IntParameter kP = new IntParameter(KMeans.K_ID, new GreaterConstraint(0)); + if(config.grab(kP)) { + k = kP.getValue(); + } + + ObjectParameter> initialP = new ObjectParameter>(KMeans.INIT_ID, KMedoidsInitialization.class, PAMInitialMeans.class); + if(config.grab(initialP)) { + initializer = initialP.instantiateClass(config); + } + + IntParameter maxiterP = new IntParameter(KMeans.MAXITER_ID, new GreaterEqualConstraint(0), 0); + if(config.grab(maxiterP)) { + maxiter = maxiterP.getValue(); + } + } + + @Override + protected KMedoidsEM makeInstance() { + return new KMedoidsEM(distanceFunction, k, maxiter, initializer); + } + } +} \ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsInitialization.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsInitialization.java new file mode 100644 index 00000000..269e7e9e --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsInitialization.java @@ -0,0 +1,45 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2012 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version.
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see . + */ + +import de.lmu.ifi.dbs.elki.database.ids.DBIDs; +import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; + +/** + * Interface for initializing K-Medoids. In contrast to k-means initializers, + * this initialization will only return members of the original data set. + * + * @author Erich Schubert + * + * @param Object type + */ +public interface KMedoidsInitialization { + /** + * Choose the initial medoids. + * + * @param k Parameter k: number of medoids to choose + * @param distanceFunction Distance query used to compare objects + * @return DBIDs of the chosen initial medoids + */ + public abstract DBIDs chooseInitialMedoids(int k, DistanceQuery distanceFunction); +} \ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsPAM.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsPAM.java new file mode 100644 index 00000000..30c80084 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsPAM.java @@ -0,0 +1,310 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2012 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version.
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see . + */ + +import java.util.ArrayList; +import java.util.List; + +import de.lmu.ifi.dbs.elki.algorithm.AbstractDistanceBasedAlgorithm; +import de.lmu.ifi.dbs.elki.algorithm.AbstractPrimitiveDistanceBasedAlgorithm; +import de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm; +import de.lmu.ifi.dbs.elki.data.Cluster; +import de.lmu.ifi.dbs.elki.data.Clustering; +import de.lmu.ifi.dbs.elki.data.model.MedoidModel; +import de.lmu.ifi.dbs.elki.data.type.TypeInformation; +import de.lmu.ifi.dbs.elki.data.type.TypeUtil; +import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; +import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; +import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore; +import de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs; +import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs; +import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; +import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.ids.DBIDs; +import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; +import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance; +import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; +import de.lmu.ifi.dbs.elki.utilities.documentation.Title; +import 
de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; + +/** + * Provides the k-medoids clustering algorithm, using the + * "Partitioning Around Medoids" approach. + * + * Reference: + *

+ * Clustering by means of Medoids
+ * Kaufman, L. and Rousseeuw, P.J.
+ * in: Statistical Data Analysis Based on the L_1–Norm and Related Methods + *

+ * + * @author Erich Schubert + * + * @apiviz.has MedoidModel + * @apiviz.composedOf KMedoidsInitialization + * + * @param vector datatype + * @param distance value type + */ +@Title("Partioning Around Medoids") +@Reference(title = "Clustering my means of Medoids", authors = "Kaufman, L. and Rousseeuw, P.J.", booktitle = "Statistical Data Analysis Based on the L_1–Norm and Related Methods") +public class KMedoidsPAM> extends AbstractDistanceBasedAlgorithm> implements ClusteringAlgorithm> { + /** + * The logger for this class. + */ + private static final Logging logger = Logging.getLogger(KMedoidsPAM.class); + + /** + * Holds the value of {@link AbstractKMeans#K_ID}. + */ + protected int k; + + /** + * Holds the value of {@link AbstractKMeans#MAXITER_ID}. + */ + protected int maxiter; + + /** + * Method to choose initial means. + */ + protected KMedoidsInitialization initializer; + + /** + * Constructor. + * + * @param distanceFunction distance function + * @param k k parameter + * @param maxiter Maxiter parameter + * @param initializer Function to generate the initial means + */ + public KMedoidsPAM(PrimitiveDistanceFunction distanceFunction, int k, int maxiter, KMedoidsInitialization initializer) { + super(distanceFunction); + this.k = k; + this.maxiter = maxiter; + this.initializer = initializer; + } + + /** + * Run k-medoids + * + * @param database Database + * @param relation relation to use + * @return result + */ + public Clustering run(Database database, Relation relation) { + if(relation.size() <= 0) { + return new Clustering("k-Medoids Clustering", "kmedoids-clustering"); + } + DistanceQuery distQ = database.getDistanceQuery(relation, getDistanceFunction()); + DBIDs ids = relation.getDBIDs(); + // Choose initial medoids + ArrayModifiableDBIDs medoids = DBIDUtil.newArray(initializer.chooseInitialMedoids(k, distQ)); + // Setup cluster assignment store + List clusters = new ArrayList(); + for(int i = 0; i < k; i++) { + 
clusters.add(DBIDUtil.newHashSet(relation.size() / k)); + } + + WritableDoubleDataStore second = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP); + // Initial assignment to nearest medoids + // TODO: reuse this information, from the build phase, when possible? + assignToNearestCluster(medoids, ids, second, clusters, distQ); + + // Swap phase + boolean changed = true; + while(changed) { + changed = false; + // Try to swap the medoid with a better cluster member: + double best = 0; + DBID bestid = null; + int bestcluster = -1; + for(int i = 0; i < k; i++) { + DBID med = medoids.get(i); + for(DBIDIter iter = clusters.get(i).iter(); iter.valid(); iter.advance()) { + if(med.sameDBID(iter)) { + continue; + } + // double disti = distQ.distance(id, med).doubleValue(); + double cost = 0; + for(int j = 0; j < k; j++) { + for(DBIDIter iter2 = clusters.get(j).iter(); iter2.valid(); iter2.advance()) { + double distcur = distQ.distance(iter2, medoids.get(j)).doubleValue(); + double distnew = distQ.distance(iter2, iter).doubleValue(); + if(j == i) { + // Cases 1 and 2. + double distsec = second.doubleValue(iter2); + if(distcur > distsec) { + // Case 1, other would switch to a third medoid + cost += distsec - distcur; // Always positive! + } + else { // Would remain with the candidate + cost += distnew - distcur; // Could be negative + } + } + else { + // Cases 3-4: objects from other clusters + if (distcur < distnew) { + // Case 3: no change + } else { + // Case 4: would switch to new medoid + cost += distnew - distcur; // Always negative + } + } + } + } + if (cost < best) { + best = cost; + bestid = iter.getDBID(); + bestcluster = i; + } + } + } + if(logger.isDebugging()) { + logger.debug("Best cost: " + best); + } + if(bestid != null) { + changed = true; + medoids.set(bestcluster, bestid); + } + // Reassign + if(changed) { + // TODO: can we save some of these recomputations? 
+ assignToNearestCluster(medoids, ids, second, clusters, distQ); + } + } + + // Wrap result + Clustering result = new Clustering("k-Medoids Clustering", "kmedoids-clustering"); + for(int i = 0; i < clusters.size(); i++) { + MedoidModel model = new MedoidModel(medoids.get(i)); + result.addCluster(new Cluster(clusters.get(i), model)); + } + return result; + } + + /** + * Returns a list of clusters. The kth cluster contains the ids of + * those FeatureVectors, that are nearest to the kth mean. + * + * @param means Object centroids + * @param ids Object ids + * @param second Distance to second nearest medoid + * @param clusters cluster assignment + * @param distQ distance query + * @return true when any object was reassigned + */ + protected boolean assignToNearestCluster(ArrayDBIDs means, DBIDs ids, WritableDoubleDataStore second, List clusters, DistanceQuery distQ) { + boolean changed = false; + + for(DBIDIter iditer = distQ.getRelation().iterDBIDs(); iditer.valid(); iditer.advance()) { + int minIndex = 0; + double mindist = Double.POSITIVE_INFINITY; + double mindist2 = Double.POSITIVE_INFINITY; + for(int i = 0; i < k; i++) { + double dist = distQ.distance(iditer, means.get(i)).doubleValue(); + if(dist < mindist) { + minIndex = i; + mindist2 = mindist; + mindist = dist; + } + else if(dist < mindist2) { + mindist2 = dist; + } + } + if(clusters.get(minIndex).add(iditer)) { + changed = true; + // Remove from previous cluster + // TODO: keep a list of cluster assignments to save this search? + for(int i = 0; i < k; i++) { + if(i != minIndex) { + if(clusters.get(i).remove(iditer)) { + break; + } + } + } + } + second.put(iditer, mindist2); + } + return changed; + } + + @Override + public TypeInformation[] getInputTypeRestriction() { + return TypeUtil.array(getDistanceFunction().getInputTypeRestriction()); + } + + @Override + protected Logging getLogger() { + return logger; + } + + /** + * Parameterization class. 
+ * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer> extends AbstractPrimitiveDistanceBasedAlgorithm.Parameterizer { + /** Value of the KMeans.K_ID option (declared with GreaterConstraint(0)). */ protected int k; + + /** Value of the KMeans.MAXITER_ID option (declared with GreaterEqualConstraint(0)). */ protected int maxiter; + + /** Medoid initializer instantiated from the KMeans.INIT_ID option. */ protected KMedoidsInitialization initializer; + + @Override + protected void makeOptions(Parameterization config) { + super.makeOptions(config); + IntParameter kP = new IntParameter(KMeans.K_ID, new GreaterConstraint(0)); + if(config.grab(kP)) { + k = kP.getValue(); + } + + ObjectParameter> initialP = new ObjectParameter>(KMeans.INIT_ID, KMedoidsInitialization.class, PAMInitialMeans.class); + if(config.grab(initialP)) { + initializer = initialP.instantiateClass(config); + } + + IntParameter maxiterP = new IntParameter(KMeans.MAXITER_ID, new GreaterEqualConstraint(0), 0); + if(config.grab(maxiterP)) { + maxiter = maxiterP.getValue(); + } + } + + @Override + protected KMedoidsPAM makeInstance() { + return new KMedoidsPAM(distanceFunction, k, maxiter, initializer); + } + } +} \ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/PAMInitialMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/PAMInitialMeans.java new file mode 100644 index 00000000..094c37bb --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/PAMInitialMeans.java @@ -0,0 +1,187 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2012 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version.
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see . + */ +import java.util.ArrayList; +import java.util.List; + +import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; +import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; +import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore; +import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs; +import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; +import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.ids.DBIDs; +import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance; +import de.lmu.ifi.dbs.elki.math.Mean; +import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; +import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; + +/** + * PAM initialization for k-means (and of course, PAM). + * + * Reference: + *

+ * Clustering by means of Medoids
+ * Kaufman, L. and Rousseeuw, P.J.
+ * in: Statistical Data Analysis Based on the L_1–Norm and Related Methods + *

+ * + * TODO: enforce using a distance matrix? + * + * @author Erich Schubert + * + * @param Vector type + * @param Distance type + */ +@Reference(title = "Clustering my means of Medoids", authors = "Kaufman, L. and Rousseeuw, P.J.", booktitle = "Statistical Data Analysis Based on the L_1–Norm and Related Methods") +public class PAMInitialMeans> implements KMeansInitialization, KMedoidsInitialization { + /** + * Constructor. + */ + public PAMInitialMeans() { + super(); + } + + @Override + public List chooseInitialMeans(Relation relation, int k, PrimitiveDistanceFunction distanceFunction) { + // Get a distance query + if(!(distanceFunction.getDistanceFactory() instanceof NumberDistance)) { + throw new AbortException("PAM initialization can only be used with numerical distances."); + } + @SuppressWarnings("unchecked") + final PrimitiveDistanceFunction distF = (PrimitiveDistanceFunction) distanceFunction; + final DistanceQuery distQ = relation.getDatabase().getDistanceQuery(relation, distF); + DBIDs medids = chooseInitialMedoids(k, distQ); + List medoids = new ArrayList(k); + for(DBIDIter iter = medids.iter(); iter.valid(); iter.advance()) { + medoids.add(relation.get(iter)); + } + return medoids; + } + + @Override + public DBIDs chooseInitialMedoids(int k, DistanceQuery distQ2) { + if(!(distQ2.getDistanceFactory() instanceof NumberDistance)) { + throw new AbortException("PAM initialization can only be used with numerical distances."); + } + @SuppressWarnings("unchecked") + DistanceQuery distQ = (DistanceQuery) distQ2; + final DBIDs ids = distQ.getRelation().getDBIDs(); + + ArrayModifiableDBIDs medids = DBIDUtil.newArray(k); + double best = Double.POSITIVE_INFINITY; + Mean mean = new Mean(); // Mean is numerically more stable than sum. + WritableDoubleDataStore mindist = null; + + // First mean is chosen by having the smallest distance sum to all others. 
+ { + DBID bestid = null; + WritableDoubleDataStore bestd = null; + for(DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) { + WritableDoubleDataStore newd = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP); + mean.reset(); + for(DBIDIter iter2 = ids.iter(); iter2.valid(); iter2.advance()) { + double d = distQ.distance(iter, iter2).doubleValue(); + mean.put(d); + newd.putDouble(iter2, d); + } + if(mean.getMean() < best) { + best = mean.getMean(); + bestid = iter.getDBID(); + if(bestd != null) { + bestd.destroy(); + } + bestd = newd; + } + else { + newd.destroy(); + } + } + medids.add(bestid); + mindist = bestd; + } + assert (mindist != null); + + // Subsequent means optimize the full criterion. + for(int i = 1; i < k; i++) { + DBID bestid = null; + WritableDoubleDataStore bestd = null; + for(DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) { + DBID id = iter.getDBID(); + if(medids.contains(id)) { + continue; + } + WritableDoubleDataStore newd = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP); + mean.reset(); + for(DBIDIter iter2 = ids.iter(); iter2.valid(); iter2.advance()) { + DBID other = iter2.getDBID(); + double dn = distQ.distance(id, other).doubleValue(); + double v = Math.min(dn, mindist.doubleValue(other)); + mean.put(v); + newd.put(other, v); + } + assert (mean.getCount() == ids.size()); + if(mean.getMean() < best) { + best = mean.getMean(); + bestid = id; + if(bestd != null) { + bestd.destroy(); + } + bestd = newd; + } + else { + newd.destroy(); + } + } + if(bestid == null) { + throw new AbortException("No median found that improves the criterion function?!?"); + } + medids.add(bestid); + mindist.destroy(); + mindist = bestd; + } + + mindist.destroy(); + return medids; + } + + /** + * Parameterization class. 
+ * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer> extends AbstractParameterizer { + @Override + protected PAMInitialMeans makeInstance() { + return new PAMInitialMeans(); + } + } +} \ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyChosenInitialMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyChosenInitialMeans.java index 30e59453..5b9da923 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyChosenInitialMeans.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyChosenInitialMeans.java @@ -25,13 +25,12 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; import java.util.ArrayList; import java.util.List; -import de.lmu.ifi.dbs.elki.data.NumberVector; -import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; +import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; -import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; /** * Initialize K-means by randomly choosing k exsiting elements as cluster @@ -41,7 +40,7 @@ import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; * * @param Vector type */ -public class RandomlyChosenInitialMeans> extends AbstractKMeansInitialization { +public class RandomlyChosenInitialMeans extends AbstractKMeansInitialization implements KMedoidsInitialization { /** * Constructor. 
* @@ -52,15 +51,20 @@ public class RandomlyChosenInitialMeans> extends Ab } @Override - public List chooseInitialMeans(Relation relation, int k, PrimitiveDistanceFunction distanceFunction) { + public List chooseInitialMeans(Relation relation, int k, PrimitiveDistanceFunction distanceFunction) { DBIDs ids = DBIDUtil.randomSample(relation.getDBIDs(), k, seed); - List means = new ArrayList(k); - for(DBID id : ids) { - means.add(relation.get(id).getColumnVector()); + List means = new ArrayList(k); + for(DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) { + means.add(relation.get(iter)); } return means; } + @Override + public DBIDs chooseInitialMedoids(int k, DistanceQuery distanceFunction) { + return DBIDUtil.randomSample(distanceFunction.getRelation().getDBIDs(), k, seed); + } + /** * Parameterization class. * @@ -68,7 +72,7 @@ public class RandomlyChosenInitialMeans> extends Ab * * @apiviz.exclude */ - public static class Parameterizer> extends AbstractKMeansInitialization.Parameterizer { + public static class Parameterizer extends AbstractKMeansInitialization.Parameterizer { @Override protected RandomlyChosenInitialMeans makeInstance() { diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyGeneratedInitialMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyGeneratedInitialMeans.java index e8a466dd..00ed08c4 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyGeneratedInitialMeans.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyGeneratedInitialMeans.java @@ -30,7 +30,6 @@ import de.lmu.ifi.dbs.elki.data.NumberVector; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; import de.lmu.ifi.dbs.elki.math.MathUtil; -import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; import de.lmu.ifi.dbs.elki.utilities.DatabaseUtil; import de.lmu.ifi.dbs.elki.utilities.pairs.Pair; @@ -53,10 +52,10 @@ public class 
RandomlyGeneratedInitialMeans> extends } @Override - public List chooseInitialMeans(Relation relation, int k, PrimitiveDistanceFunction distanceFunction) { + public List chooseInitialMeans(Relation relation, int k, PrimitiveDistanceFunction distanceFunction) { final int dim = DatabaseUtil.dimensionality(relation); Pair minmax = DatabaseUtil.computeMinMax(relation); - List means = new ArrayList(k); + List means = new ArrayList(k); final Random random = (this.seed != null) ? new Random(this.seed) : new Random(); for(int i = 0; i < k; i++) { double[] r = MathUtil.randomDoubleArray(dim, random); @@ -64,12 +63,11 @@ public class RandomlyGeneratedInitialMeans> extends for(int d = 0; d < dim; d++) { r[d] = minmax.first.doubleValue(d + 1) + (minmax.second.doubleValue(d + 1) - minmax.first.doubleValue(d + 1)) * r[d]; } - means.add(new Vector(r)); + means.add(minmax.first.newNumberVector(r)); } return means; } - /** * Parameterization class. * @@ -78,7 +76,6 @@ public class RandomlyGeneratedInitialMeans> extends * @apiviz.exclude */ public static class Parameterizer> extends AbstractKMeansInitialization.Parameterizer { - @Override protected RandomlyGeneratedInitialMeans makeInstance() { return new RandomlyGeneratedInitialMeans(seed); diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/CLIQUE.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/CLIQUE.java index e3b274a6..01a693e4 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/CLIQUE.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/CLIQUE.java @@ -27,14 +27,12 @@ import java.util.ArrayList; import java.util.Collection; import java.util.Collections; import java.util.HashMap; -import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.SortedMap; import java.util.TreeMap; import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm; -import de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm; import 
de.lmu.ifi.dbs.elki.algorithm.clustering.subspace.clique.CLIQUESubspace; import de.lmu.ifi.dbs.elki.algorithm.clustering.subspace.clique.CLIQUEUnit; import de.lmu.ifi.dbs.elki.data.Cluster; @@ -46,6 +44,7 @@ import de.lmu.ifi.dbs.elki.data.model.SubspaceModel; import de.lmu.ifi.dbs.elki.data.type.TypeInformation; import de.lmu.ifi.dbs.elki.data.type.TypeUtil; import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.logging.Logging; @@ -97,7 +96,7 @@ import de.lmu.ifi.dbs.elki.utilities.pairs.Pair; @Title("CLIQUE: Automatic Subspace Clustering of High Dimensional Data for Data Mining Applications") @Description("Grid-based algorithm to identify dense clusters in subspaces of maximum dimensionality.") @Reference(authors = "R. Agrawal, J. Gehrke, D. Gunopulos, P. Raghavan", title = "Automatic Subspace Clustering of High Dimensional Data for Data Mining Applications", booktitle = "Proc. SIGMOD Conference, Seattle, WA, 1998", url = "http://dx.doi.org/10.1145/276304.276314") -public class CLIQUE> extends AbstractAlgorithm>> implements ClusteringAlgorithm>> { +public class CLIQUE> extends AbstractAlgorithm>> implements SubspaceClusteringAlgorithm> { /** * The logger for this class. 
*/ @@ -299,8 +298,8 @@ public class CLIQUE> extends AbstractAlgorithm it = database.iterDBIDs(); it.hasNext();) { - V featureVector = database.get(it.next()); + for(DBIDIter it = database.iterDBIDs(); it.valid(); it.advance()) { + V featureVector = database.get(it); updateMinMax(featureVector, minima, maxima); } for(int i = 0; i < maxima.length; i++) { @@ -393,13 +392,15 @@ public class CLIQUE> extends AbstractAlgorithm it = database.iterDBIDs(); it.hasNext();) { - final DBID id = it.next(); - V featureVector = database.get(id); + for(DBIDIter it = database.iterDBIDs(); it.valid();) { + V featureVector = database.get(it); + final DBID id = it.getDBID(); + it.advance(); for(CLIQUEUnit unit : units) { unit.addFeatureVector(id, featureVector); // unit is a dense unit - if(!it.hasNext() && unit.selectivity(total) >= tau) { + // FIXME: why it.valid()? + if(!it.valid() && unit.selectivity(total) >= tau) { denseUnits.add(unit); // add the dense unit to its subspace int dim = unit.getIntervals().iterator().next().getDimension(); diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/DiSH.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/DiSH.java index c4c1687b..df3fe8b5 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/DiSH.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/DiSH.java @@ -34,7 +34,6 @@ import java.util.List; import java.util.Map; import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm; -import de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm; import de.lmu.ifi.dbs.elki.algorithm.clustering.OPTICS; import de.lmu.ifi.dbs.elki.data.Cluster; import de.lmu.ifi.dbs.elki.data.Clustering; @@ -100,7 +99,7 @@ import de.lmu.ifi.dbs.elki.utilities.pairs.Pair; @Title("DiSH: Detecting Subspace cluster Hierarchies") @Description("Algorithm to find hierarchical correlation clusters in subspaces.") @Reference(authors = "E. Achtert, C. Böhm, H.-P. Kriegel, P. Kröger, I. Müller-Gorman, A. 
Zimek", title = "Detection and Visualization of Subspace Cluster Hierarchies", booktitle = "Proc. 12th International Conference on Database Systems for Advanced Applications (DASFAA), Bangkok, Thailand, 2007", url = "http://dx.doi.org/10.1007/978-3-540-71703-4_15") -public class DiSH> extends AbstractAlgorithm>> implements ClusteringAlgorithm>> { +public class DiSH> extends AbstractAlgorithm>> implements SubspaceClusteringAlgorithm> { /** * The logger for this class. */ @@ -162,8 +161,11 @@ public class DiSH> extends AbstractAlgorithm> run(Database database, Relation relation) throws IllegalStateException { + public Clustering> run(Database database, Relation relation) { // Instantiate DiSH distance (and thus run the preprocessor) if(logger.isVerbose()) { logger.verbose("*** Run DiSH preprocessor."); diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/PROCLUS.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/PROCLUS.java index 3f16e907..4eedbecd 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/PROCLUS.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/PROCLUS.java @@ -28,7 +28,6 @@ import java.util.BitSet; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; -import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Random; @@ -39,13 +38,13 @@ import de.lmu.ifi.dbs.elki.data.Cluster; import de.lmu.ifi.dbs.elki.data.Clustering; import de.lmu.ifi.dbs.elki.data.NumberVector; import de.lmu.ifi.dbs.elki.data.Subspace; -import de.lmu.ifi.dbs.elki.data.model.Model; import de.lmu.ifi.dbs.elki.data.model.SubspaceModel; import de.lmu.ifi.dbs.elki.data.type.TypeInformation; import de.lmu.ifi.dbs.elki.data.type.TypeUtil; import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import 
de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; @@ -87,11 +86,11 @@ import de.lmu.ifi.dbs.elki.utilities.pairs.Pair; * * @param the type of NumberVector handled by this Algorithm */ +// TODO: optimize by creating much less objects @Title("PROCLUS: PROjected CLUStering") @Description("Algorithm to find subspace clusters in high dimensional spaces.") @Reference(authors = "C. C. Aggarwal, C. Procopiuc, J. L. Wolf, P. S. Yu, J. S. Park", title = "Fast Algorithms for Projected Clustering", booktitle = "Proc. ACM SIGMOD Int. Conf. on Management of Data (SIGMOD '99)", url = "http://dx.doi.org/10.1145/304181.304188") -// TODO: make the generics reflect the SubspaceModel -public class PROCLUS> extends AbstractProjectedClustering, V> { +public class PROCLUS> extends AbstractProjectedClustering>, V> implements SubspaceClusteringAlgorithm> { /** * The logger for this class. */ @@ -141,8 +140,11 @@ public class PROCLUS> extends AbstractProjectedClus /** * Performs the PROCLUS algorithm on the given database. + * + * @param database Database to process + * @param relation Relation to process */ - public Clustering run(Database database, Relation relation) throws IllegalStateException { + public Clustering> run(Database database, Relation relation) { DistanceQuery distFunc = this.getDistanceQuery(database); RangeQuery rangeQuery = database.getRangeQuery(distFunc); final Random random = new Random(); @@ -193,6 +195,7 @@ public class PROCLUS> extends AbstractProjectedClus IndefiniteProgress cprogress = logger.isVerbose() ? 
new IndefiniteProgress("Current number of clusters:", logger) : null; + // TODO: Use DataStore and Trove for performance Map clusters = null; int loops = 0; while(loops < 10) { @@ -229,9 +232,9 @@ public class PROCLUS> extends AbstractProjectedClus // build result int numClusters = 1; - Clustering result = new Clustering("ProClus clustering", "proclus-clustering"); + Clustering> result = new Clustering>("ProClus clustering", "proclus-clustering"); for(PROCLUSCluster c : finalClusters) { - Cluster cluster = new Cluster(c.objectIDs); + Cluster> cluster = new Cluster>(c.objectIDs); cluster.setModel(new SubspaceModel(new Subspace(c.getDimensions()), c.centroid)); cluster.setName("cluster_" + numClusters++); @@ -262,7 +265,8 @@ public class PROCLUS> extends AbstractProjectedClus // compute distances between each point in S and m_i Map> distances = new HashMap>(); - for(DBID id : s) { + for(DBIDIter iter = s.iter(); iter.valid(); iter.advance()) { + DBID id = iter.getDBID(); DoubleDistance dist = distFunc.distance(id, m_i); distances.put(id, new GenericDistanceResultPair(dist, id)); } @@ -278,7 +282,8 @@ public class PROCLUS> extends AbstractProjectedClus distances.remove(m_i); // compute distances of each point to closest medoid - for(DBID id : s) { + for(DBIDIter iter = s.iter(); iter.valid(); iter.advance()) { + DBID id = iter.getDBID(); DoubleDistance dist_new = distFunc.distance(id, m_i); DoubleDistance dist_old = distances.get(id).getDistance(); @@ -323,12 +328,11 @@ public class PROCLUS> extends AbstractProjectedClus */ private ModifiableDBIDs computeM_current(DBIDs m, DBIDs m_best, DBIDs m_bad, Random random) { ArrayModifiableDBIDs m_list = DBIDUtil.newArray(m); - for(DBID m_i : m_best) { - m_list.remove(m_i); - } + m_list.removeDBIDs(m_best); ModifiableDBIDs m_current = DBIDUtil.newHashSet(); - for(DBID m_i : m_best) { + for(DBIDIter iter = m_best.iter(); iter.valid(); iter.advance()) { + DBID m_i = iter.getDBID(); if(m_bad.contains(m_i)) { int currentSize = 
m_current.size(); while(m_current.size() == currentSize) { @@ -358,11 +362,13 @@ public class PROCLUS> extends AbstractProjectedClus private Map>> getLocalities(DBIDs medoids, Relation database, DistanceQuery distFunc, RangeQuery rangeQuery) { Map>> result = new HashMap>>(); - for(DBID m : medoids) { + for(DBIDIter iter = medoids.iter(); iter.valid(); iter.advance()) { + DBID m = iter.getDBID(); // determine minimum distance between current medoid m and any other // medoid m_i DoubleDistance minDist = null; - for(DBID m_i : medoids) { + for(DBIDIter iter2 = medoids.iter(); iter2.valid(); iter2.advance()) { + DBID m_i = iter2.getDBID(); if(m_i == m) { continue; } @@ -399,7 +405,8 @@ public class PROCLUS> extends AbstractProjectedClus int dim = DatabaseUtil.dimensionality(database); Map averageDistances = new HashMap(); - for(DBID m_i : medoids) { + for(DBIDIter iter = medoids.iter(); iter.valid(); iter.advance()) { + DBID m_i = iter.getDBID(); V medoid_i = database.get(m_i); List> l_i = localities.get(m_i); double[] x_i = new double[dim]; @@ -417,7 +424,8 @@ public class PROCLUS> extends AbstractProjectedClus Map> dimensionMap = new HashMap>(); List> z_ijs = new ArrayList>(); - for(DBID m_i : medoids) { + for(DBIDIter iter = medoids.iter(); iter.valid(); iter.advance()) { + DBID m_i = iter.getDBID(); Set dims_i = new HashSet(); dimensionMap.put(m_i, dims_i); @@ -478,8 +486,8 @@ public class PROCLUS> extends AbstractProjectedClus for(int i = 0; i < clusters.size(); i++) { PROCLUSCluster c_i = clusters.get(i); double[] x_i = new double[dim]; - for(DBID id : c_i.objectIDs) { - V o = database.get(id); + for(DBIDIter iter = c_i.objectIDs.iter(); iter.valid(); iter.advance()) { + V o = database.get(iter); for(int d = 0; d < dim; d++) { x_i[d] += Math.abs(c_i.centroid.doubleValue(d + 1) - o.doubleValue(d + 1)); } @@ -560,8 +568,8 @@ public class PROCLUS> extends AbstractProjectedClus clusterIDs.put(m_i, DBIDUtil.newHashSet()); } - for(Iterator it = database.iterDBIDs(); 
it.hasNext();) { - DBID p_id = it.next(); + for(DBIDIter it = database.iterDBIDs(); it.valid(); it.advance()) { + DBID p_id = it.getDBID(); V p = database.get(p_id); DistanceResultPair minDist = null; for(DBID m_i : dimensions.keySet()) { @@ -610,8 +618,8 @@ public class PROCLUS> extends AbstractProjectedClus clusterIDs.put(i, DBIDUtil.newHashSet()); } - for(Iterator it = database.iterDBIDs(); it.hasNext();) { - DBID p_id = it.next(); + for(DBIDIter it = database.iterDBIDs(); it.valid(); it.advance()) { + DBID p_id = it.getDBID(); V p = database.get(p_id); Pair minDist = null; for(int i = 0; i < dimensions.size(); i++) { @@ -707,8 +715,8 @@ public class PROCLUS> extends AbstractProjectedClus */ private double avgDistance(V centroid, DBIDs objectIDs, Relation database, int dimension) { double avg = 0; - for(DBID objectID : objectIDs) { - V o = database.get(objectID); + for(DBIDIter iter = objectIDs.iter(); iter.valid(); iter.advance()) { + V o = database.get(iter); avg += Math.abs(centroid.doubleValue(dimension) - o.doubleValue(dimension)); } return avg / objectIDs.size(); diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/SUBCLU.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/SUBCLU.java index 963c0922..c47c74b6 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/SUBCLU.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/SUBCLU.java @@ -30,7 +30,6 @@ import java.util.List; import java.util.TreeMap; import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm; -import de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm; import de.lmu.ifi.dbs.elki.algorithm.clustering.DBSCAN; import de.lmu.ifi.dbs.elki.data.Cluster; import de.lmu.ifi.dbs.elki.data.Clustering; @@ -77,7 +76,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; * @author Elke Achtert * * @apiviz.uses DBSCAN - * @apiviz.uses DimensionsSelectingEuclideanDistanceFunction + * @apiviz.uses 
AbstractDimensionsSelectingDoubleDistanceFunction * @apiviz.has SubspaceModel * * @param the type of FeatureVector handled by this Algorithm @@ -85,7 +84,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; @Title("SUBCLU: Density connected Subspace Clustering") @Description("Algorithm to detect arbitrarily shaped and positioned clusters in subspaces. SUBCLU delivers for each subspace the same clusters DBSCAN would have found, when applied to this subspace seperately.") @Reference(authors = "K. Kailing, H.-P. Kriegel, P. Kröger", title = "Density connected Subspace Clustering for High Dimensional Data. ", booktitle = "Proc. SIAM Int. Conf. on Data Mining (SDM'04), Lake Buena Vista, FL, 2004") -public class SUBCLU> extends AbstractAlgorithm>> implements ClusteringAlgorithm>> { +public class SUBCLU> extends AbstractAlgorithm>> implements SubspaceClusteringAlgorithm> { /** * The logger for this class. */ @@ -162,7 +161,7 @@ public class SUBCLU> extends AbstractAlgorithm> run(Relation relation) throws IllegalStateException { + public Clustering> run(Relation relation) { final int dimensionality = DatabaseUtil.dimensionality(relation); StepProgress stepprog = logger.isVerbose() ? 
new StepProgress(dimensionality) : null; diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/SubspaceClusteringAlgorithm.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/SubspaceClusteringAlgorithm.java new file mode 100644 index 00000000..17eb3c19 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/SubspaceClusteringAlgorithm.java @@ -0,0 +1,39 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.subspace; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2012 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see . + */ +import de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm; +import de.lmu.ifi.dbs.elki.data.Clustering; +import de.lmu.ifi.dbs.elki.data.model.SubspaceModel; + +/** + * Interface for subspace clustering algorithms that use a model derived from + * {@link SubspaceModel}, that can then be post-processed for outlier detection. 
+ * + * @author Erich Schubert + * + * @param Model type + */ +public interface SubspaceClusteringAlgorithm> extends ClusteringAlgorithm> { + // No additional constraints +} diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/ByLabelClustering.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/ByLabelClustering.java index 43c6a218..ee42a59f 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/ByLabelClustering.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/ByLabelClustering.java @@ -39,7 +39,11 @@ import de.lmu.ifi.dbs.elki.data.type.TypeInformation; import de.lmu.ifi.dbs.elki.data.type.TypeUtil; import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; +import de.lmu.ifi.dbs.elki.database.ids.DBIDRef; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.ids.DBIDs; +import de.lmu.ifi.dbs.elki.database.ids.HashSetModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.logging.Logging; @@ -137,12 +141,12 @@ public class ByLabelClustering extends AbstractAlgorithm> impl * @param relation The data input we use */ public Clustering run(Relation relation) { - HashMap labelMap = multiple ? multipleAssignment(relation) : singleAssignment(relation); + HashMap labelMap = multiple ? 
multipleAssignment(relation) : singleAssignment(relation); ModifiableDBIDs noiseids = DBIDUtil.newArray(); Clustering result = new Clustering("By Label Clustering", "bylabel-clustering"); - for(Entry entry : labelMap.entrySet()) { - ModifiableDBIDs ids = entry.getValue(); + for(Entry entry : labelMap.entrySet()) { + DBIDs ids = entry.getValue(); if(ids.size() <= 1) { noiseids.addDBIDs(ids); continue; @@ -170,12 +174,13 @@ public class ByLabelClustering extends AbstractAlgorithm> impl * @param data the database storing the objects * @return a mapping of labels to ids */ - private HashMap singleAssignment(Relation data) { - HashMap labelMap = new HashMap(); + private HashMap singleAssignment(Relation data) { + HashMap labelMap = new HashMap(); - for(DBID id : data.iterDBIDs()) { - String label = data.get(id).toString(); - assign(labelMap, label, id); + for(DBIDIter iditer = data.iterDBIDs(); iditer.valid(); iditer.advance()) { + final Object val = data.get(iditer); + String label = (val != null) ? 
val.toString() : null; + assign(labelMap, label, iditer); } return labelMap; } @@ -187,13 +192,13 @@ public class ByLabelClustering extends AbstractAlgorithm> impl * @param data the database storing the objects * @return a mapping of labels to ids */ - private HashMap multipleAssignment(Relation data) { - HashMap labelMap = new HashMap(); + private HashMap multipleAssignment(Relation data) { + HashMap labelMap = new HashMap(); - for(DBID id : data.iterDBIDs()) { - String[] labels = data.get(id).toString().split(" "); + for(DBIDIter iditer = data.iterDBIDs(); iditer.valid(); iditer.advance()) { + String[] labels = data.get(iditer).toString().split(" "); for(String label : labels) { - assign(labelMap, label, id); + assign(labelMap, label, iditer); } } return labelMap; @@ -206,14 +211,22 @@ public class ByLabelClustering extends AbstractAlgorithm> impl * @param label the label of the object to be assigned * @param id the id of the object to be assigned */ - private void assign(HashMap labelMap, String label, DBID id) { + private void assign(HashMap labelMap, String label, DBIDRef id) { if(labelMap.containsKey(label)) { - labelMap.get(label).add(id); + DBIDs exist = labelMap.get(label); + if (exist instanceof DBID) { + ModifiableDBIDs n = DBIDUtil.newHashSet(); + n.add((DBID)exist); + n.add(id); + labelMap.put(label, n); + } else { + assert(exist instanceof HashSetModifiableDBIDs); + assert (exist.size() > 1); + ((ModifiableDBIDs)exist).add(id); + } } else { - ModifiableDBIDs n = DBIDUtil.newHashSet(); - n.add(id); - labelMap.put(label, n); + labelMap.put(label, id.getDBID()); } } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/ByLabelHierarchicalClustering.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/ByLabelHierarchicalClustering.java index 228cc7e7..26bf525a 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/ByLabelHierarchicalClustering.java +++ 
b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/ByLabelHierarchicalClustering.java @@ -39,7 +39,11 @@ import de.lmu.ifi.dbs.elki.data.type.TypeInformation; import de.lmu.ifi.dbs.elki.data.type.TypeUtil; import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; +import de.lmu.ifi.dbs.elki.database.ids.DBIDRef; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.ids.DBIDs; +import de.lmu.ifi.dbs.elki.database.ids.HashSetModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.logging.Logging; @@ -96,28 +100,26 @@ public class ByLabelHierarchicalClustering extends AbstractAlgorithm run(Relation relation) throws IllegalStateException { - HashMap labelmap = new HashMap(); + public Clustering run(Relation relation) { + HashMap labelmap = new HashMap(); ModifiableDBIDs noiseids = DBIDUtil.newArray(); - for(DBID id : relation.iterDBIDs()) { - String label = relation.get(id).toString(); - - if(labelmap.containsKey(label)) { - labelmap.get(label).add(id); - } - else { - ModifiableDBIDs n = DBIDUtil.newHashSet(); - n.add(id); - labelmap.put(label, n); + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + final Object val = relation.get(iditer); + if(val == null) { + noiseids.add(iditer); + continue; } + String label = val.toString(); + + assign(labelmap, label, iditer); } ArrayList> clusters = new ArrayList>(labelmap.size()); - for(Entry entry : labelmap.entrySet()) { - ModifiableDBIDs ids = entry.getValue(); - if(ids.size() <= 1) { - noiseids.addDBIDs(ids); + for(Entry entry : labelmap.entrySet()) { + DBIDs ids = entry.getValue(); + if(ids instanceof DBID) { + noiseids.add((DBID) ids); continue; } Cluster clus = new Cluster(entry.getKey(), ids, ClusterModel.CLUSTER, new ArrayList>(), new ArrayList>()); @@ -153,6 +155,33 @@ 
public class ByLabelHierarchicalClustering extends AbstractAlgorithm("By Label Hierarchical Clustering", "bylabel-clustering", rootclusters); } + /** + * Assigns the specified id to the labelMap according to its label + * + * @param labelMap the mapping of label to ids + * @param label the label of the object to be assigned + * @param id the id of the object to be assigned + */ + private void assign(HashMap labelMap, String label, DBIDRef id) { + if(labelMap.containsKey(label)) { + DBIDs exist = labelMap.get(label); + if(exist instanceof DBID) { + ModifiableDBIDs n = DBIDUtil.newHashSet(); + n.add((DBID) exist); + n.add(id); + labelMap.put(label, n); + } + else { + assert (exist instanceof HashSetModifiableDBIDs); + assert (exist.size() > 1); + ((ModifiableDBIDs) exist).add(id); + } + } + else { + labelMap.put(label, id.getDBID()); + } + } + @Override public TypeInformation[] getInputTypeRestriction() { return TypeUtil.array(TypeUtil.GUESSED_LABEL); diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/ByLabelOrAllInOneClustering.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/ByLabelOrAllInOneClustering.java new file mode 100644 index 00000000..f082db9c --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/ByLabelOrAllInOneClustering.java @@ -0,0 +1,74 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.trivial; + +import de.lmu.ifi.dbs.elki.data.ClassLabel; +import de.lmu.ifi.dbs.elki.data.Cluster; +import de.lmu.ifi.dbs.elki.data.Clustering; +import de.lmu.ifi.dbs.elki.data.model.ClusterModel; +import de.lmu.ifi.dbs.elki.data.model.Model; +import de.lmu.ifi.dbs.elki.data.type.NoSupportedDataTypeException; +import de.lmu.ifi.dbs.elki.data.type.TypeUtil; +import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.ids.DBIDs; +import de.lmu.ifi.dbs.elki.database.relation.Relation; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + 
Copyright (C) 2012 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see . + */ + +/** + * Trivial class that will try to cluster by label, and fall back to an + * "all-in-one" clustering. + * + * @author Erich Schubert + */ +public class ByLabelOrAllInOneClustering extends ByLabelClustering { + /** + * Constructor. + */ + public ByLabelOrAllInOneClustering() { + super(); + } + + @Override + public Clustering run(Database database) { + // Prefer a true class label + try { + Relation relation = database.getRelation(TypeUtil.CLASSLABEL); + return run(relation); + } + catch(NoSupportedDataTypeException e) { + // Ignore. + } + try { + Relation relation = database.getRelation(TypeUtil.GUESSED_LABEL); + return run(relation); + } + catch(NoSupportedDataTypeException e) { + // Ignore. 
+ } + final DBIDs ids = database.getRelation(TypeUtil.ANY).getDBIDs(); + Clustering result = new Clustering("All-in-one trivial Clustering", "allinone-clustering"); + Cluster c = new Cluster(ids, ClusterModel.CLUSTER); + result.addCluster(c); + return result; + } +} diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/ByModelClustering.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/ByModelClustering.java index cd45cda2..90ca3625 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/ByModelClustering.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/ByModelClustering.java @@ -35,7 +35,7 @@ import de.lmu.ifi.dbs.elki.data.model.Model; import de.lmu.ifi.dbs.elki.data.synthetic.bymodel.GeneratorInterface; import de.lmu.ifi.dbs.elki.data.type.TypeInformation; import de.lmu.ifi.dbs.elki.data.type.TypeUtil; -import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.relation.Relation; @@ -102,14 +102,14 @@ public class ByModelClustering extends AbstractAlgorithm> impl public Clustering run(Relation relation) { // Build model mapping HashMap modelMap = new HashMap(); - for(DBID id : relation.iterDBIDs()) { - Model model = relation.get(id); + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + Model model = relation.get(iditer); ModifiableDBIDs modelids = modelMap.get(model); if(modelids == null) { modelids = DBIDUtil.newHashSet(); modelMap.put(model, modelids); } - modelids.add(id); + modelids.add(iditer); } Clustering result = new Clustering("By Model Clustering", "bymodel-clustering"); diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/ABOD.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/ABOD.java index f0b31d32..88a62e38 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/ABOD.java +++ 
b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/ABOD.java @@ -38,6 +38,8 @@ import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore; import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; +import de.lmu.ifi.dbs.elki.database.ids.DBIDRef; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; @@ -186,20 +188,21 @@ public class ABOD> extends AbstractDistanceBasedAlg assert (k == this.k); KNNQuery knnQuery = QueryUtil.getKNNQuery(relation, getDistanceFunction(), k); - for(DBID objKey : relation.iterDBIDs()) { - MeanVariance s = new MeanVariance(); + MeanVariance s = new MeanVariance(); + for(DBIDIter objKey = relation.iterDBIDs(); objKey.valid(); objKey.advance()) { + s.reset(); // System.out.println("Processing: " +objKey); KNNResult neighbors = knnQuery.getKNNForDBID(objKey, k); Iterator> iter = neighbors.iterator(); while(iter.hasNext()) { - DBID key1 = iter.next().getDBID(); + DistanceResultPair key1 = iter.next(); // Iterator iter2 = data.keyIterator(); Iterator> iter2 = neighbors.iterator(); // PriorityQueue best = new PriorityQueue(false, k); while(iter2.hasNext()) { - DBID key2 = iter2.next().getDBID(); - if(key2.equals(key1) || key1.equals(objKey) || key2.equals(objKey)) { + DistanceResultPair key2 = iter2.next(); + if(key2.sameDBID(key1) || key1.sameDBID(objKey) || key2.sameDBID(objKey)) { continue; } double nenner = calcDenominator(kernelMatrix, objKey, key1, key2); @@ -214,7 +217,7 @@ public class ABOD> extends AbstractDistanceBasedAlg } // Sample variance probably would be correct, however the numerical // instabilities can actually break ABOD here. 
- pq.add(new DoubleObjPair(s.getNaiveVariance(), objKey)); + pq.add(new DoubleObjPair(s.getNaiveVariance(), objKey.getDBID())); } DoubleMinMax minmaxabod = new DoubleMinMax(); @@ -238,16 +241,18 @@ public class ABOD> extends AbstractDistanceBasedAlg * @return result */ public OutlierResult getFastRanking(Relation relation, int k, int sampleSize) { + final DBIDs ids = relation.getDBIDs(); // Fix a static set of IDs - staticids = DBIDUtil.newArray(relation.getDBIDs()); + // TODO: add a DBIDUtil.ensureSorted? + staticids = DBIDUtil.newArray(ids); staticids.sort(); KernelMatrix kernelMatrix = new KernelMatrix(primitiveKernelFunction, relation, staticids); Heap> pq = new Heap>(relation.size(), Collections.reverseOrder()); // get Candidate Ranking - for(DBID aKey : relation.iterDBIDs()) { - HashMap dists = new HashMap(relation.size()); + for(DBIDIter aKey = relation.iterDBIDs(); aKey.valid(); aKey.advance()) { + WritableDoubleDataStore dists = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT); // determine kNearestNeighbors and pairwise distances Heap> nn; if(!useRNDSample) { @@ -259,7 +264,7 @@ public class ABOD> extends AbstractDistanceBasedAlg } // get normalization - double[] counter = calcFastNormalization(aKey, dists); + double[] counter = calcFastNormalization(aKey, dists, staticids); // System.out.println(counter[0] + " " + counter2[0] + " " + counter[1] + // " " + counter2[1]); // umsetzen von Pq zu list @@ -269,13 +274,14 @@ public class ABOD> extends AbstractDistanceBasedAlg } // getFilter double var = getAbofFilter(kernelMatrix, aKey, dists, counter[1], counter[0], neighbors); - pq.add(new DoubleObjPair(var, aKey)); + pq.add(new DoubleObjPair(var, aKey.getDBID())); // System.out.println("prog "+(prog++)); } // refine Candidates Heap> resqueue = new Heap>(k); // System.out.println(pq.size() + " objects ordered into candidate list."); // int v = 0; + MeanVariance s = new MeanVariance(); while(!pq.isEmpty()) { 
if(resqueue.size() == k && pq.peek().first > resqueue.peek().first) { break; @@ -290,13 +296,13 @@ public class ABOD> extends AbstractDistanceBasedAlg // + " worst result: " + Double.MAX_VALUE); // } // v++; - MeanVariance s = new MeanVariance(); - for(DBID bKey : relation.iterDBIDs()) { - if(bKey.equals(aKey)) { + s.reset(); + for(DBIDIter bKey = relation.iterDBIDs(); bKey.valid(); bKey.advance()) { + if(bKey.sameDBID(aKey)) { continue; } - for(DBID cKey : relation.iterDBIDs()) { - if(cKey.equals(aKey)) { + for(DBIDIter cKey = relation.iterDBIDs(); cKey.valid(); cKey.advance()) { + if(cKey.sameDBID(aKey)) { continue; } // double nenner = dists[y]*dists[z]; @@ -325,64 +331,60 @@ public class ABOD> extends AbstractDistanceBasedAlg } // System.out.println(v + " Punkte von " + data.size() + " verfeinert !!"); DoubleMinMax minmaxabod = new DoubleMinMax(); - WritableDoubleDataStore abodvalues = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC); + WritableDoubleDataStore abodvalues = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_STATIC); for(DoubleObjPair pair : pq) { abodvalues.putDouble(pair.getSecond(), pair.first); minmaxabod.put(pair.first); } // Build result representation. 
- Relation scoreResult = new MaterializedRelation("Angle-based Outlier Detection", "abod-outlier", TypeUtil.DOUBLE, abodvalues, relation.getDBIDs()); + Relation scoreResult = new MaterializedRelation("Angle-based Outlier Detection", "abod-outlier", TypeUtil.DOUBLE, abodvalues, ids); OutlierScoreMeta scoreMeta = new InvertedOutlierScoreMeta(minmaxabod.getMin(), minmaxabod.getMax(), 0.0, Double.POSITIVE_INFINITY); return new OutlierResult(scoreMeta, scoreResult); } - private double[] calcFastNormalization(DBID x, HashMap dists) { + private double[] calcFastNormalization(DBIDRef x, WritableDoubleDataStore dists, DBIDs ids) { double[] result = new double[2]; double sum = 0; double sumF = 0; - for(DBID yKey : dists.keySet()) { - if(dists.get(yKey) != 0) { - double tmp = 1 / Math.sqrt(dists.get(yKey)); + for (DBIDIter yKey = ids.iter(); yKey.valid(); yKey.advance()) { + if(dists.doubleValue(yKey) != 0) { + double tmp = 1 / Math.sqrt(dists.doubleValue(yKey)); sum += tmp; - sumF += (1 / dists.get(yKey)) * tmp; + sumF += (1 / dists.doubleValue(yKey)) * tmp; } } double sofar = 0; double sofarF = 0; - for(DBID zKey : dists.keySet()) { - if(dists.get(zKey) != 0) { - double tmp = 1 / Math.sqrt(dists.get(zKey)); + for (DBIDIter zKey = ids.iter(); zKey.valid(); zKey.advance()) { + if(dists.doubleValue(zKey) != 0) { + double tmp = 1 / Math.sqrt(dists.doubleValue(zKey)); sofar += tmp; double rest = sum - sofar; result[0] += tmp * rest; - sofarF += (1 / dists.get(zKey)) * tmp; + sofarF += (1 / dists.doubleValue(zKey)) * tmp; double restF = sumF - sofarF; - result[1] += (1 / dists.get(zKey)) * tmp * restF; + result[1] += (1 / dists.doubleValue(zKey)) * tmp * restF; } } return result; } - private double getAbofFilter(KernelMatrix kernelMatrix, DBID aKey, HashMap dists, double fulCounter, double counter, DBIDs neighbors) { + private double getAbofFilter(KernelMatrix kernelMatrix, DBIDRef aKey, WritableDoubleDataStore dists, double fulCounter, double counter, DBIDs neighbors) { double 
sum = 0.0; double sqrSum = 0.0; double partCounter = 0; - Iterator iter = neighbors.iterator(); - while(iter.hasNext()) { - DBID bKey = iter.next(); - if(bKey.equals(aKey)) { + for(DBIDIter bKey = neighbors.iter(); bKey.valid(); bKey.advance()) { + if(bKey.sameDBID(aKey)) { continue; } - Iterator iter2 = neighbors.iterator(); - while(iter2.hasNext()) { - DBID cKey = iter2.next(); - if(cKey.equals(aKey)) { + for(DBIDIter cKey = neighbors.iter(); cKey.valid(); cKey.advance()) { + if(cKey.sameDBID(aKey)) { continue; } - if(bKey.compareTo(cKey) > 0) { - double nenner = dists.get(bKey).doubleValue() * dists.get(cKey).doubleValue(); + if(bKey.compareDBID(cKey) > 0) { + double nenner = dists.doubleValue(bKey) * dists.doubleValue(cKey); if(nenner != 0) { double tmp = calcNumerator(kernelMatrix, aKey, bKey, cKey) / nenner; double sqrtNenner = Math.sqrt(nenner); @@ -406,13 +408,13 @@ public class ABOD> extends AbstractDistanceBasedAlg * @param bKey * @return cosinus value */ - private double calcCos(KernelMatrix kernelMatrix, DBID aKey, DBID bKey) { + private double calcCos(KernelMatrix kernelMatrix, DBIDRef aKey, DBIDRef bKey) { final int ai = mapDBID(aKey); final int bi = mapDBID(bKey); return kernelMatrix.getDistance(ai, ai) + kernelMatrix.getDistance(bi, bi) - 2 * kernelMatrix.getDistance(ai, bi); } - private int mapDBID(DBID aKey) { + private int mapDBID(DBIDRef aKey) { // TODO: this is not the most efficient... 
int off = staticids.binarySearch(aKey); if(off < 0) { @@ -421,44 +423,44 @@ public class ABOD> extends AbstractDistanceBasedAlg return off + 1; } - private double calcDenominator(KernelMatrix kernelMatrix, DBID aKey, DBID bKey, DBID cKey) { + private double calcDenominator(KernelMatrix kernelMatrix, DBIDRef aKey, DBIDRef bKey, DBIDRef cKey) { return calcCos(kernelMatrix, aKey, bKey) * calcCos(kernelMatrix, aKey, cKey); } - private double calcNumerator(KernelMatrix kernelMatrix, DBID aKey, DBID bKey, DBID cKey) { + private double calcNumerator(KernelMatrix kernelMatrix, DBIDRef aKey, DBIDRef bKey, DBIDRef cKey) { final int ai = mapDBID(aKey); final int bi = mapDBID(bKey); final int ci = mapDBID(cKey); return (kernelMatrix.getDistance(ai, ai) + kernelMatrix.getDistance(bi, ci) - kernelMatrix.getDistance(ai, ci) - kernelMatrix.getDistance(ai, bi)); } - private Heap> calcDistsandNN(Relation data, KernelMatrix kernelMatrix, int sampleSize, DBID aKey, HashMap dists) { + private Heap> calcDistsandNN(Relation data, KernelMatrix kernelMatrix, int sampleSize, DBIDRef aKey, WritableDoubleDataStore dists) { Heap> nn = new Heap>(sampleSize); - for(DBID bKey : data.iterDBIDs()) { + for(DBIDIter bKey = data.iterDBIDs(); bKey.valid(); bKey.advance()) { double val = calcCos(kernelMatrix, aKey, bKey); - dists.put(bKey, val); + dists.putDouble(bKey, val); if(nn.size() < sampleSize) { - nn.add(new DoubleObjPair(val, bKey)); + nn.add(new DoubleObjPair(val, bKey.getDBID())); } else { if(val < nn.peek().first) { nn.remove(); - nn.add(new DoubleObjPair(val, bKey)); + nn.add(new DoubleObjPair(val, bKey.getDBID())); } } } return nn; } - private Heap> calcDistsandRNDSample(Relation data, KernelMatrix kernelMatrix, int sampleSize, DBID aKey, HashMap dists) { + private Heap> calcDistsandRNDSample(Relation data, KernelMatrix kernelMatrix, int sampleSize, DBIDRef aKey, WritableDoubleDataStore dists) { Heap> nn = new Heap>(sampleSize); int step = (int) ((double) data.size() / (double) 
sampleSize); int counter = 0; - for(DBID bKey : data.iterDBIDs()) { + for(DBIDIter bKey = data.iterDBIDs(); bKey.valid(); bKey.advance()) { double val = calcCos(kernelMatrix, aKey, bKey); - dists.put(bKey, val); + dists.putDouble(bKey, val); if(counter % step == 0) { - nn.add(new DoubleObjPair(val, bKey)); + nn.add(new DoubleObjPair(val, bKey.getDBID())); } counter++; } @@ -477,24 +479,21 @@ public class ABOD> extends AbstractDistanceBasedAlg Heap> pq = new Heap>(data.size(), Collections.reverseOrder()); HashMap explaintab = new HashMap(); // test all objects - for(DBID objKey : data.iterDBIDs()) { - MeanVariance s = new MeanVariance(); + MeanVariance s = new MeanVariance(), s2 = new MeanVariance(); + for(DBIDIter objKey = data.iterDBIDs(); objKey.valid(); objKey.advance()) { + s.reset(); // Queue for the best explanation Heap> explain = new Heap>(); // determine Object // for each pair of other objects - Iterator iter = data.iterDBIDs(); + for (DBIDIter key1 = data.iterDBIDs(); key1.valid(); key1.advance()) { // Collect Explanation Vectors - while(iter.hasNext()) { - MeanVariance s2 = new MeanVariance(); - DBID key1 = iter.next(); - Iterator iter2 = data.iterDBIDs(); - if(objKey.equals(key1)) { + s2.reset(); + if(objKey.sameDBID(key1)) { continue; } - while(iter2.hasNext()) { - DBID key2 = iter2.next(); - if(key2.equals(key1) || objKey.equals(key2)) { + for (DBIDIter key2 = data.iterDBIDs(); key2.valid(); key2.advance()) { + if(key2.sameDBID(key1) || objKey.sameDBID(key2)) { continue; } double nenner = calcDenominator(kernelMatrix, objKey, key1, key2); @@ -504,22 +503,22 @@ public class ABOD> extends AbstractDistanceBasedAlg s2.put(tmp, 1 / sqr); } } - explain.add(new DoubleObjPair(s2.getSampleVariance(), key1)); + explain.add(new DoubleObjPair(s2.getSampleVariance(), key1.getDBID())); s.put(s2); } // build variance of the observed vectors - pq.add(new DoubleObjPair(s.getSampleVariance(), objKey)); + pq.add(new DoubleObjPair(s.getSampleVariance(), 
objKey.getDBID())); // ModifiableDBIDs expList = DBIDUtil.newArray(); expList.add(explain.remove().getSecond()); while(!explain.isEmpty()) { DBID nextKey = explain.remove().getSecond(); - if(nextKey.equals(objKey)) { + if(nextKey.sameDBID(objKey)) { continue; } double max = Double.MIN_VALUE; - for(DBID exp : expList) { - if(exp.equals(objKey) || nextKey.equals(exp)) { + for(DBIDIter exp = expList.iter(); exp.valid(); exp.advance()) { + if(exp.sameDBID(objKey) || nextKey.sameDBID(exp)) { continue; } double nenner = Math.sqrt(calcCos(kernelMatrix, objKey, nextKey)) * Math.sqrt(calcCos(kernelMatrix, objKey, exp)); @@ -530,7 +529,7 @@ public class ABOD> extends AbstractDistanceBasedAlg expList.add(nextKey); } } - explaintab.put(objKey, expList); + explaintab.put(objKey.getDBID(), expList); } System.out.println("--------------------------------------------"); System.out.println("Result: ABOD"); @@ -552,10 +551,9 @@ public class ABOD> extends AbstractDistanceBasedAlg private void generateExplanation(Relation data, DBID key, DBIDs expList) { Vector vect1 = data.get(key).getColumnVector(); - Iterator iter = expList.iterator(); - while(iter.hasNext()) { + for(DBIDIter iter = expList.iter(); iter.valid(); iter.advance()) { System.out.println("Outlier: " + vect1); - Vector exp = data.get(iter.next()).getColumnVector(); + Vector exp = data.get(iter).getColumnVector(); System.out.println("Most common neighbor: " + exp); // determine difference Vector Vector vals = exp.minus(vect1); diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/ALOCI.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/ALOCI.java new file mode 100644 index 00000000..39c3db60 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/ALOCI.java @@ -0,0 +1,724 @@ +package de.lmu.ifi.dbs.elki.algorithm.outlier; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2012 + Ludwig-Maximilians-Universität München + Lehr- und 
Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see . + */ + +import java.util.ArrayList; +import java.util.List; +import java.util.Random; + +import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm; +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.data.type.TypeInformation; +import de.lmu.ifi.dbs.elki.data.type.TypeUtil; +import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; +import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; +import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore; +import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; +import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.distance.distancefunction.EuclideanDistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancefunction.NumberVectorDistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance; +import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress; +import de.lmu.ifi.dbs.elki.math.DoubleMinMax; +import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; +import 
de.lmu.ifi.dbs.elki.result.outlier.OutlierResult; +import de.lmu.ifi.dbs.elki.result.outlier.OutlierScoreMeta; +import de.lmu.ifi.dbs.elki.result.outlier.QuotientOutlierScoreMeta; +import de.lmu.ifi.dbs.elki.utilities.DatabaseUtil; +import de.lmu.ifi.dbs.elki.utilities.documentation.Description; +import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; +import de.lmu.ifi.dbs.elki.utilities.documentation.Title; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.LongParameter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; +import de.lmu.ifi.dbs.elki.utilities.pairs.Pair; + +/** + * Fast Outlier Detection Using the "approximate Local Correlation Integral". + * + * Outlier detection using multiple epsilon neighborhoods. + * + * Reference: + *

+ * S. Papadimitriou, H. Kitagawa, P. B. Gibbons and C. Faloutsos:
+ * LOCI: Fast Outlier Detection Using the Local Correlation Integral.
+ * In: Proc. 19th IEEE Int. Conf. on Data Engineering (ICDE '03), Bangalore, + * India, 2003. + *

+ * + * @author Jonathan von Brünken + * @author Erich Schubert + * + * @param Object type + * @param Distance type + */ +@Title("LOCI: Fast Outlier Detection Using the Local Correlation Integral") +@Description("Algorithm to compute outliers based on the Local Correlation Integral") +@Reference(authors = "S. Papadimitriou, H. Kitagawa, P. B. Gibbons, C. Faloutsos", title = "LOCI: Fast Outlier Detection Using the Local Correlation Integral", booktitle = "Proc. 19th IEEE Int. Conf. on Data Engineering (ICDE '03), Bangalore, India, 2003", url = "http://dx.doi.org/10.1109/ICDE.2003.1260802") +public class ALOCI, D extends NumberDistance> extends AbstractAlgorithm implements OutlierAlgorithm { + /** + * The logger for this class. + */ + private static final Logging logger = Logging.getLogger(ALOCI.class); + + /** + * Minimum size for a leaf. + */ + private int nmin; + + /** + * Alpha (level difference of sampling and counting neighborhoods) + */ + private int alpha; + + /** + * Number of trees to generate (forest size) + */ + private int g; + + /** + * Random generator + */ + private Random random; + + /** + * Distance function + */ + private NumberVectorDistanceFunction distFunc; + + /** + * Constructor. + * + * @param distanceFunction Distance function + * @param nmin Minimum neighborhood size + * @param alpha Alpha value + * @param g Number of grids to use + * @param seed Random generator seed. + */ + public ALOCI(NumberVectorDistanceFunction distanceFunction, int nmin, int alpha, int g, Long seed) { + super(); + this.distFunc = distanceFunction; + this.nmin = nmin; + this.alpha = alpha; + this.g = g; + this.random = (seed != null) ? new Random(seed) : new Random(0); + } + + public OutlierResult run(Database database, Relation relation) { + final int dim = DatabaseUtil.dimensionality(relation); + FiniteProgress progressPreproc = logger.isVerbose() ? new FiniteProgress("Build aLOCI quadtress", g, logger) : null; + + // Compute extend of dataset. 
+ double[] min, max; + { + Pair hbbs = DatabaseUtil.computeMinMax(relation); + double maxd = 0; + min = new double[dim]; + max = new double[dim]; + for(int i = 0; i < dim; i++) { + min[i] = hbbs.first.doubleValue(i + 1); + max[i] = hbbs.second.doubleValue(i + 1); + maxd = Math.max(maxd, max[i] - min[i]); + } + // Enlarge bounding box to have equal lengths. + for(int i = 0; i < dim; i++) { + double diff = (maxd - (max[i] - min[i])) / 2; + min[i] -= diff; + max[i] += diff; + } + } + + List qts = new ArrayList(g); + + double[] nshift = new double[dim]; + ALOCIQuadTree qt = new ALOCIQuadTree(min, max, nshift, nmin, relation); + qts.add(qt); + if(progressPreproc != null) { + progressPreproc.incrementProcessed(logger); + } + /* + * create the remaining g-1 shifted QuadTrees. This not clearly described in + * the paper and therefore implemented in a way that achieves good results + * with the test data. + */ + for(int shift = 1; shift < g; shift++) { + double[] svec = new double[dim]; + for(int i = 0; i < dim; i++) { + svec[i] = random.nextDouble() * (max[i] - min[i]); + } + qt = new ALOCIQuadTree(min, max, svec, nmin, relation); + qts.add(qt); + if(progressPreproc != null) { + progressPreproc.incrementProcessed(logger); + } + } + if(progressPreproc != null) { + progressPreproc.ensureCompleted(logger); + } + + // aLOCI main loop: evaluate + FiniteProgress progressLOCI = logger.isVerbose() ? 
new FiniteProgress("Compute aLOCI scores", relation.size(), logger) : null; + WritableDoubleDataStore mdef_norm = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC); + DoubleMinMax minmax = new DoubleMinMax(); + + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + final O obj = relation.get(iditer); + + double maxmdefnorm = 0; + // For each level + for(int l = 0;; l++) { + // Find the closest C_i + Node ci = null; + for(int i = 0; i < g; i++) { + Node ci2 = qts.get(i).findClosestNode(obj, l); + if(ci2.getLevel() != l) { + continue; + } + // TODO: always use manhattan? + if(ci == null || distFunc.distance(ci.getCenter(), obj).compareTo(distFunc.distance(ci2.getCenter(), obj)) > 0) { + ci = ci2; + } + } + // logger.debug("level:" + (ci != null ? ci.getLevel() : -1) +" l:"+l); + if(ci == null) { + break; // no matching tree for this level. + } + + // Find the closest C_j + Node cj = null; + for(int i = 0; i < g; i++) { + Node cj2 = qts.get(i).findClosestNode(ci.getCenter(), l - alpha); + // TODO: allow higher levels or not? + if(cj != null && cj2.getLevel() < cj.getLevel()) { + continue; + } + // TODO: always use manhattan? + if(cj == null || distFunc.distance(cj.getCenter(), ci.getCenter()).compareTo(distFunc.distance(cj2.getCenter(), ci.getCenter())) > 0) { + cj = cj2; + } + } + // logger.debug("level:" + (cj != null ? cj.getLevel() : -1) +" l:"+l); + if(cj == null) { + continue; // no matching tree for this level. 
+ } + double mdefnorm = calculate_MDEF_norm(cj, ci); + // logger.warning("level:" + ci.getLevel() + "/" + cj.getLevel() + + // " mdef: " + mdefnorm); + maxmdefnorm = Math.max(maxmdefnorm, mdefnorm); + } + // Store results + mdef_norm.putDouble(iditer, maxmdefnorm); + minmax.put(maxmdefnorm); + if(progressLOCI != null) { + progressLOCI.incrementProcessed(logger); + } + } + if(progressLOCI != null) { + progressLOCI.ensureCompleted(logger); + } + Relation scoreResult = new MaterializedRelation("aLOCI normalized MDEF", "aloci-mdef-outlier", TypeUtil.DOUBLE, mdef_norm, relation.getDBIDs()); + OutlierScoreMeta scoreMeta = new QuotientOutlierScoreMeta(minmax.getMin(), minmax.getMax(), 0.0, Double.POSITIVE_INFINITY); + OutlierResult result = new OutlierResult(scoreMeta, scoreResult); + return result; + } + + /** + * Method for the MDEF calculation + * + * @param sn Sampling Neighborhood + * @param cg Counting Neighborhood + * + * @return MDEF norm + */ + private static double calculate_MDEF_norm(Node sn, Node cg) { + // get the square sum of the counting neighborhoods box counts + long sq = sn.getSquareSum(cg.getLevel() - sn.getLevel()); + /* + * if the square sum is equal to box count of the sampling Neighborhood then + * n_hat is equal one, and as cg needs to have at least one Element mdef + * would get zero or lower than zero. This is the case when all of the + * counting Neighborhoods contain one or zero Objects. Additionally, the + * cubic sum, square sum and sampling Neighborhood box count are all equal, + * which leads to sig_n_hat being zero and thus mdef_norm is either negative + * infinite or undefined. As the distribution of the Objects seem quite + * uniform, a mdef_norm value of zero ( = no outlier) is appropriate and + * circumvents the problem of undefined values. 
+ */ + if(sq == sn.getCount()) { + return 0.0; + } + // calculation of mdef according to the paper and standardization as done in + // LOCI + long cb = sn.getCubicSum(cg.getLevel() - sn.getLevel()); + double n_hat = (double) sq / sn.getCount(); + double sig_n_hat = java.lang.Math.sqrt(cb * sn.getCount() - (sq * sq)) / sn.getCount(); + // Avoid NaN - correct result 0.0? + if(sig_n_hat < Double.MIN_NORMAL) { + return 0.0; + } + double mdef = n_hat - cg.getCount(); + return mdef / sig_n_hat; + } + + @Override + protected Logging getLogger() { + return logger; + } + + @Override + public TypeInformation[] getInputTypeRestriction() { + return TypeUtil.array(distFunc.getInputTypeRestriction()); + } + + /** + * Simple quadtree for ALOCI. Not storing the actual objects, just the counts. + * + * Furthermore, the quadtree can be shifted by a specified vector, wrapping + * around min/max + * + * @author Jonathan von Brünken + * @author Erich Schubert + * + * @apiviz.composedOf Node + */ + static class ALOCIQuadTree { + /** + * Tree parameters + */ + private double[] shift, min, width; + + /** + * Maximum fill for a page before splitting + */ + private int nmin; + + /** + * Tree root + */ + Node root; + + /** + * Relation indexed. + */ + private Relation> relation; + + /** + * Constructor. 
+ * + * @param min Minimum coordinates + * @param max Maximum coordinates + * @param shift Tree shift offset + * @param nmin Maximum size for a page to split + * @param relation Relation to index + */ + public ALOCIQuadTree(double[] min, double[] max, double[] shift, int nmin, Relation> relation) { + super(); + assert (min.length <= 32) : "Quadtrees are only supported for up to 32 dimensions"; + this.shift = shift; + this.nmin = nmin; + this.min = min; + this.width = new double[min.length]; + for(int d = 0; d < min.length; d++) { + width[d] = max[d] - min[d]; + if(width[d] <= 0) { + width[d] = 1; + } + } + double[] center = new double[min.length]; + for(int d = 0; d < min.length; d++) { + if(shift[d] < width[d] * .5) { + center[d] = min[d] + shift[d] + width[d] * .5; + } + else { + center[d] = min[d] + shift[d] - width[d] * .5; + } + } + this.relation = relation; + ArrayModifiableDBIDs ids = DBIDUtil.newArray(relation.getDBIDs()); + List children = new ArrayList(); + bulkLoad(min.clone(), max.clone(), children, ids, 0, ids.size(), 0, 0, 0); + this.root = new Node(0, new Vector(center), ids.size(), -1, children); + } + + /** + * Bulk load the tree + * + * @param lmin Subtree minimum (unshifted, will be modified) + * @param lmax Subtree maximum (unshifted, will be modified) + * @param children List of children for current parent + * @param ids IDs to process + * @param start Start of ids subinterval + * @param end End of ids subinterval + * @param dim Current dimension + * @param level Current tree level + * @param code Bit code of node position + */ + private void bulkLoad(double[] lmin, double[] lmax, List children, ArrayModifiableDBIDs ids, int start, int end, int dim, int level, int code) { + // logger.warning(FormatUtil.format(lmin)+" "+FormatUtil.format(lmax)+" "+start+"->"+end+" "+(end-start)); + // Hack: Check degenerate cases that won't split + if(dim == 0) { + NumberVector first = relation.get(ids.get(start)); + boolean degenerate = true; + loop: for(int 
pos = start + 1; pos < end; pos++) { + NumberVector other = relation.get(ids.get(pos)); + for(int d = 1; d <= lmin.length; d++) { + if(Math.abs(first.doubleValue(d) - other.doubleValue(d)) > 1E-15) { + degenerate = false; + break loop; + } + } + } + if(degenerate) { + double[] center = new double[lmin.length]; + for(int d = 0; d < lmin.length; d++) { + center[d] = lmin[d] * .5 + lmax[d] * .5 + shift[d]; + if(center[d] > min[d] + width[d]) { + center[d] -= width[d]; + } + } + children.add(new Node(code, new Vector(center), end - start, level, null)); + return; + } + } + // Complete level + if(dim == lmin.length) { + double[] center = new double[lmin.length]; + for(int d = 0; d < lmin.length; d++) { + center[d] = lmin[d] * .5 + lmax[d] * .5 + shift[d]; + if(center[d] > min[d] + width[d]) { + center[d] -= width[d]; + } + } + if(end - start < nmin) { + children.add(new Node(code, new Vector(center), end - start, level, null)); + return; + } + else { + List newchildren = new ArrayList(); + bulkLoad(lmin, lmax, newchildren, ids, start, end, 0, level + 1, 0); + children.add(new Node(code, new Vector(center), end - start, level, newchildren)); + return; + } + } + else { + // Partially sort data, by dimension dim < mid + int spos = start, epos = end; + while(spos < epos) { + if(getShiftedDim(relation.get(ids.get(spos)), dim, level) <= .5) { + spos++; + continue; + } + if(getShiftedDim(relation.get(ids.get(epos - 1)), dim, level) > 0.5) { + epos--; + continue; + } + ids.swap(spos, epos - 1); + spos++; + epos--; + } + if(start < spos) { + final double tmp = lmax[dim]; + lmax[dim] = lmax[dim] * .5 + lmin[dim] * .5; + bulkLoad(lmin, lmax, children, ids, start, spos, dim + 1, level, code); + lmax[dim] = tmp; // Restore + } + if(spos < end) { + final double tmp = lmin[dim]; + lmin[dim] = lmax[dim] * .5 + lmin[dim] * .5; + bulkLoad(lmin, lmax, children, ids, spos, end, dim + 1, level, code | (1 << dim)); + lmin[dim] = tmp; // Restore + } + } + } + + /** + * Shift and wrap a single 
dimension. + * + * @param obj Object + * @param dim Dimension + * @param level Level (controls scaling/wraping!) + * @return Shifted position + */ + private double getShiftedDim(NumberVector obj, int dim, int level) { + double pos = obj.doubleValue(dim + 1) + shift[dim]; + pos = (pos - min[dim]) / width[dim] * (1 + level); + return pos - Math.floor(pos); + } + + /** + * Find the closest node (of depth tlevel or above, if there is no node at + * this depth) for the given vector. + * + * @param vec Query vector + * @param tlevel Target level + * @return Node + */ + public Node findClosestNode(NumberVector vec, int tlevel) { + Node cur = root; + for(int level = 0; level <= tlevel; level++) { + if(cur.children == null) { + break; + } + int code = 0; + for(int d = 0; d < min.length; d++) { + if(getShiftedDim(vec, d, level) > .5) { + code |= 1 << d; + } + } + boolean found = false; + for(Node child : cur.children) { + if(child.code == code) { + cur = child; + found = true; + break; + } + } + if(!found) { + break; // Do not descend + } + } + return cur; + } + } + + /** + * Node of the ALOCI Quadtree + * + * @author Erich Schubert + */ + static class Node { + /** + * Position code + */ + final int code; + + /** + * Number of elements + */ + final int count; + + /** + * Level of node + */ + final int level; + + /** + * Child nodes, may be null + */ + List children; + + /** + * Parent node + */ + Node parent = null; + + /** + * Center vector + */ + Vector center; + + /** + * Constructor. + * + * @param code Node code + * @param center Center vector + * @param count Element count + * @param level Node level + * @param children Children list + */ + protected Node(int code, Vector center, int count, int level, List children) { + this.code = code; + this.center = center; + this.count = count; + this.level = level; + this.children = children; + if(children != null) { + for(Node child : children) { + child.parent = this; + } + } + } + + /** + * Get level of node. 
+ * + * @return Level of node + */ + public int getLevel() { + return level; + } + + /** + * Get count of subtree + * + * @return subtree count + */ + public int getCount() { + return count; + } + + /** + * Return center vector + * + * @return center vector + */ + public Vector getCenter() { + return center; + } + + /** + * Get sum of squares, recursively + * + * @param levels Depth to collect + * @return Sum of squares + */ + public long getSquareSum(int levels) { + if(levels <= 0 || children == null) { + return ((long) count) * ((long) count); + } + long agg = 0; + for(Node child : children) { + agg += child.getSquareSum(levels - 1); + } + return agg; + } + + /** + * Get cubic sum. + * + * @param levels Level to collect + * @return sum of cubes + */ + public long getCubicSum(int levels) { + if(levels <= 0 || children == null) { + return ((long) count) * ((long) count) * ((long) count); + } + long agg = 0; + for(Node child : children) { + agg += child.getCubicSum(levels - 1); + } + return agg; + } + } + + /** + * Parameterization class. + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer, D extends NumberDistance> extends AbstractParameterizer { + /** + * Parameter to specify the minimum neighborhood size + */ + public static final OptionID NMIN_ID = OptionID.getOrCreateOptionID("loci.nmin", "Minimum neighborhood size to be considered."); + + /** + * Parameter to specify the averaging neighborhood scaling. + */ + public static final OptionID ALPHA_ID = OptionID.getOrCreateOptionID("loci.alpha", "Scaling factor for averaging neighborhood"); + + /** + * Parameter to specify the number of Grids to use. + */ + public static final OptionID GRIDS_ID = OptionID.getOrCreateOptionID("loci.g", "The number of Grids to use."); + + /** + * Parameter to specify the seed to initialize Random. 
+ */ + public static final OptionID SEED_ID = OptionID.getOrCreateOptionID("loci.seed", "The seed to use for initializing Random."); + + /** + * Neighborhood minimum size + */ + protected int nmin = 0; + + /** + * Alpha: number of levels difference to use in comparison + */ + protected int alpha = 4; + + /** + * G: number of shifted trees to create. + */ + protected int g = 1; + + /** + * Random generator seed + */ + protected Long seed = null; + + /** + * The distance function + */ + private NumberVectorDistanceFunction distanceFunction; + + @Override + protected void makeOptions(Parameterization config) { + super.makeOptions(config); + + ObjectParameter> distanceFunctionP = makeParameterDistanceFunction(EuclideanDistanceFunction.class, NumberVectorDistanceFunction.class); + if(config.grab(distanceFunctionP)) { + distanceFunction = distanceFunctionP.instantiateClass(config); + } + + final IntParameter nminP = new IntParameter(NMIN_ID, 20); + if(config.grab(nminP)) { + nmin = nminP.getValue(); + } + + final IntParameter g = new IntParameter(GRIDS_ID, 1); + if(config.grab(g)) { + this.g = g.getValue(); + } + + final LongParameter seedP = new LongParameter(SEED_ID, true); + if(config.grab(seedP)) { + this.seed = seedP.getValue(); + } + + final IntParameter alphaP = new IntParameter(ALPHA_ID, 4); + if(config.grab(alphaP)) { + this.alpha = alphaP.getValue(); + if(this.alpha < 1) { + this.alpha = 1; + } + } + } + + @Override + protected ALOCI makeInstance() { + return new ALOCI(distanceFunction, nmin, alpha, g, seed); + } + } +} \ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/AbstractAggarwalYuOutlier.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/AbstractAggarwalYuOutlier.java index 994ce8e2..9c1a216a 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/AbstractAggarwalYuOutlier.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/AbstractAggarwalYuOutlier.java @@ -33,6 +33,7 @@ import 
de.lmu.ifi.dbs.elki.data.type.TypeInformation; import de.lmu.ifi.dbs.elki.data.type.TypeUtil; import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; import de.lmu.ifi.dbs.elki.database.ids.HashSetModifiableDBIDs; @@ -45,7 +46,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; -import de.lmu.ifi.dbs.elki.utilities.pairs.FCPair; +import de.lmu.ifi.dbs.elki.utilities.pairs.DoubleObjPair; import de.lmu.ifi.dbs.elki.utilities.pairs.IntIntPair; /** @@ -121,22 +122,23 @@ public abstract class AbstractAggarwalYuOutlier> ex final ArrayList> ranges = new ArrayList>(); // Temporary projection storage of the database - final ArrayList>> dbAxis = new ArrayList>>(dim); + final ArrayList>> dbAxis = new ArrayList>>(dim); for(int i = 0; i < dim; i++) { - ArrayList> axis = new ArrayList>(size); + ArrayList> axis = new ArrayList>(size); dbAxis.add(i, axis); } // Project - for(DBID id : allids) { + for(DBIDIter iter = allids.iter(); iter.valid(); iter.advance()) { + DBID id = iter.getDBID(); final V obj = database.get(id); for(int d = 1; d <= dim; d++) { - dbAxis.get(d - 1).add(new FCPair(obj.doubleValue(d), id)); + dbAxis.get(d - 1).add(new DoubleObjPair(obj.doubleValue(d), id)); } } // Split into cells final double part = size * 1.0 / phi; for(int d = 1; d <= dim; d++) { - ArrayList> axis = dbAxis.get(d - 1); + ArrayList> axis = dbAxis.get(d - 1); Collections.sort(axis); ArrayList dimranges = new ArrayList(phi + 1); dimranges.add(allids); diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/AbstractDBOutlier.java 
b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/AbstractDBOutlier.java index 1d77af3a..a5ccce3a 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/AbstractDBOutlier.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/AbstractDBOutlier.java @@ -77,8 +77,11 @@ public abstract class AbstractDBOutlier> extends Abstra /** * Runs the algorithm in the timed evaluation part. * + * @param database Database to process + * @param relation Relation to process + * @return Outlier result */ - public OutlierResult run(Database database, Relation relation) throws IllegalStateException { + public OutlierResult run(Database database, Relation relation) { // Run the actual score process DataStore dbodscore = computeOutlierScores(database, relation, d); diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/AggarwalYuEvolutionary.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/AggarwalYuEvolutionary.java index 5d357744..1d02e865 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/AggarwalYuEvolutionary.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/AggarwalYuEvolutionary.java @@ -38,6 +38,7 @@ import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore; import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation; import de.lmu.ifi.dbs.elki.database.relation.Relation; @@ -139,9 +140,8 @@ public class AggarwalYuEvolutionary> extends Abstra * @param database Database * @param relation Relation * @return Result - * @throws IllegalStateException */ - public OutlierResult run(Database database, Relation relation) throws IllegalStateException { + public OutlierResult run(Database database, Relation relation) { final int dbsize = relation.size(); ArrayList> ranges = buildRanges(relation); @@ -151,7 
+151,8 @@ public class AggarwalYuEvolutionary> extends Abstra for(Individuum ind : individuums) { DBIDs ids = computeSubspaceForGene(ind.getGene(), ranges); double sparsityC = sparsity(ids.size(), dbsize, k); - for(DBID id : ids) { + for (DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) { + DBID id = iter.getDBID(); double prev = outlierScore.doubleValue(id); if(Double.isNaN(prev) || sparsityC < prev) { outlierScore.putDouble(id, sparsityC); @@ -160,7 +161,8 @@ public class AggarwalYuEvolutionary> extends Abstra } DoubleMinMax minmax = new DoubleMinMax(); - for(DBID id : relation.iterDBIDs()) { + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + DBID id = iditer.getDBID(); double val = outlierScore.doubleValue(id); if(Double.isNaN(val)) { outlierScore.putDouble(id, 0.0); diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/AggarwalYuNaive.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/AggarwalYuNaive.java index 190211c3..0bb73aba 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/AggarwalYuNaive.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/AggarwalYuNaive.java @@ -31,7 +31,7 @@ import de.lmu.ifi.dbs.elki.data.type.TypeUtil; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore; -import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation; import de.lmu.ifi.dbs.elki.database.relation.Relation; @@ -147,19 +147,19 @@ public class AggarwalYuNaive> extends AbstractAggar final double sparsityC = sparsity(ids.size(), size, k); if(sparsityC < 0) { - for(DBID id : ids) { - double prev = sparsity.doubleValue(id); + for (DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) { + double prev = sparsity.doubleValue(iter); 
if(Double.isNaN(prev) || sparsityC < prev) { - sparsity.putDouble(id, sparsityC); + sparsity.putDouble(iter, sparsityC); } } } } DoubleMinMax minmax = new DoubleMinMax(); - for(DBID id : relation.iterDBIDs()) { - double val = sparsity.doubleValue(id); + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + double val = sparsity.doubleValue(iditer); if(Double.isNaN(val)) { - sparsity.putDouble(id, 0.0); + sparsity.putDouble(iditer, 0.0); val = 0.0; } minmax.put(val); diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/DBOutlierDetection.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/DBOutlierDetection.java index f4b0ba35..dbaf8a5a 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/DBOutlierDetection.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/DBOutlierDetection.java @@ -23,14 +23,12 @@ package de.lmu.ifi.dbs.elki.algorithm.outlier; along with this program. If not, see . */ -import java.util.Iterator; - import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.datastore.DataStore; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore; -import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.query.DatabaseQuery; import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; import de.lmu.ifi.dbs.elki.database.query.knn.KNNQuery; @@ -117,19 +115,19 @@ public class DBOutlierDetection> extends AbstractDBOutl // if index exists, kNN query. 
if the distance to the mth nearest neighbor // is more than d -> object is outlier if(knnQuery != null) { - for(DBID id : distFunc.getRelation().iterDBIDs()) { + for(DBIDIter iditer = distFunc.getRelation().iterDBIDs(); iditer.valid(); iditer.advance()) { counter++; - final KNNResult knns = knnQuery.getKNNForDBID(id, m); + final KNNResult knns = knnQuery.getKNNForDBID(iditer, m); if(logger.isDebugging()) { logger.debugFine("distance to mth nearest neighbour" + knns.toString()); } if(knns.get(Math.min(m, knns.size()) - 1).getDistance().compareTo(neighborhoodSize) <= 0) { // flag as outlier - scores.putDouble(id, 1.0); + scores.putDouble(iditer, 1.0); } else { // flag as no outlier - scores.putDouble(id, 0.0); + scores.putDouble(iditer, 0.0); } } if(progressOFlags != null) { @@ -138,27 +136,16 @@ public class DBOutlierDetection> extends AbstractDBOutl } else { // range query for each object. stop if m objects are found - for(DBID id : distFunc.getRelation().iterDBIDs()) { + for(DBIDIter iditer = distFunc.getRelation().iterDBIDs(); iditer.valid(); iditer.advance()) { counter++; - Iterator iterator = distFunc.getRelation().iterDBIDs(); int count = 0; - while(iterator.hasNext() && count < m) { - DBID currentID = iterator.next(); - D currentDistance = distFunc.distance(id, currentID); - + for (DBIDIter iterator = distFunc.getRelation().iterDBIDs(); iterator.valid() && count < m; iterator.advance()) { + D currentDistance = distFunc.distance(iditer, iterator); if(currentDistance.compareTo(neighborhoodSize) <= 0) { count++; } } - - if(count < m) { - // flag as outlier - scores.putDouble(id, 1.0); - } - else { - // flag as no outlier - scores.putDouble(id, 0.0); - } + scores.putDouble(iditer, (count < m) ? 
1.0 : 0); } if(progressOFlags != null) { diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/DBOutlierScore.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/DBOutlierScore.java index ec83a2a2..419b9a0e 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/DBOutlierScore.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/DBOutlierScore.java @@ -28,7 +28,7 @@ import de.lmu.ifi.dbs.elki.database.datastore.DataStore; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore; -import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; import de.lmu.ifi.dbs.elki.database.query.range.RangeQuery; import de.lmu.ifi.dbs.elki.database.relation.Relation; @@ -80,10 +80,10 @@ public class DBOutlierScore> extends AbstractDBOutlier< WritableDoubleDataStore scores = DataStoreUtil.makeDoubleStorage(distFunc.getRelation().getDBIDs(), DataStoreFactory.HINT_STATIC); // TODO: use bulk when implemented. 
- for(DBID id : distFunc.getRelation().iterDBIDs()) { + for(DBIDIter iditer = distFunc.getRelation().iterDBIDs(); iditer.valid(); iditer.advance()) { // compute percentage of neighbors in the given neighborhood with size d - double n = (rangeQuery.getRangeForDBID(id, d).size()) / size; - scores.putDouble(id, 1.0 - n); + double n = (rangeQuery.getRangeForDBID(iditer, d).size()) / size; + scores.putDouble(iditer, 1.0 - n); } return scores; } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/EMOutlier.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/EMOutlier.java index 92d92036..db4b7782 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/EMOutlier.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/EMOutlier.java @@ -1,26 +1,27 @@ package de.lmu.ifi.dbs.elki.algorithm.outlier; -/* -This file is part of ELKI: -Environment for Developing KDD-Applications Supported by Index-Structures - -Copyright (C) 2012 -Ludwig-Maximilians-Universität München -Lehr- und Forschungseinheit für Datenbanksysteme -ELKI Development Team - -This program is free software: you can redistribute it and/or modify -it under the terms of the GNU Affero General Public License as published by -the Free Software Foundation, either version 3 of the License, or -(at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU Affero General Public License for more details. - -You should have received a copy of the GNU Affero General Public License -along with this program. If not, see . 
-*/ + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2012 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see . + */ import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm; import de.lmu.ifi.dbs.elki.algorithm.clustering.EM; @@ -33,7 +34,7 @@ import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore; -import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.logging.Logging; @@ -84,19 +85,23 @@ public class EMOutlier> extends AbstractAlgorithm relation) throws IllegalStateException { + public OutlierResult run(Database database, Relation relation) { Clustering> emresult = emClustering.run(database, relation); double globmax = 0.0; WritableDoubleDataStore emo_score = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT); - for(DBID id : relation.iterDBIDs()) { + for(DBIDIter iditer = 
relation.iterDBIDs(); iditer.valid(); iditer.advance()) { double maxProb = Double.POSITIVE_INFINITY; - double[] probs = emClustering.getProbClusterIGivenX(id); + double[] probs = emClustering.getProbClusterIGivenX(iditer); for(double prob : probs) { maxProb = Math.min(1 - prob, maxProb); } - emo_score.putDouble(id, maxProb); + emo_score.putDouble(iditer, maxProb); globmax = Math.max(maxProb, globmax); } Relation scoreres = new MaterializedRelation("EM outlier scores", "em-outlier", TypeUtil.DOUBLE, emo_score, relation.getDBIDs()); diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/GaussianModel.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/GaussianModel.java index ae47c100..51833c8b 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/GaussianModel.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/GaussianModel.java @@ -30,6 +30,7 @@ import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore; import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.logging.Logging; @@ -92,7 +93,13 @@ public class GaussianModel> extends AbstractAlgorit this.invert = invert; } - public OutlierResult run(Relation relation) throws IllegalStateException { + /** + * Run the algorithm + * + * @param relation Data relation + * @return Outlier result + */ + public OutlierResult run(Relation relation) { DoubleMinMax mm = new DoubleMinMax(); // resulting scores WritableDoubleDataStore oscores = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT); @@ -109,20 +116,21 @@ public class GaussianModel> extends AbstractAlgorit final double fakt = (1.0 / (Math.sqrt(Math.pow(MathUtil.TWOPI, 
DatabaseUtil.dimensionality(relation)) * covarianceMatrix.det()))); // for each object compute Mahalanobis distance - for(DBID id : relation.iterDBIDs()) { - Vector x = relation.get(id).getColumnVector().minusEquals(mean); + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + Vector x = relation.get(iditer).getColumnVector().minusEquals(mean); // Gaussian PDF final double mDist = x.transposeTimesTimes(covarianceTransposed, x); final double prob = fakt * Math.exp(-mDist / 2.0); mm.put(prob); - oscores.putDouble(id, prob); + oscores.putDouble(iditer, prob); } final OutlierScoreMeta meta; if(invert) { double max = mm.getMax() != 0 ? mm.getMax() : 1.; - for(DBID id : relation.iterDBIDs()) { + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + DBID id = iditer.getDBID(); oscores.putDouble(id, (max - oscores.doubleValue(id)) / max); } meta = new BasicOutlierScoreMeta(0.0, 1.0); diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/GaussianUniformMixture.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/GaussianUniformMixture.java index aa352582..1cd31442 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/GaussianUniformMixture.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/GaussianUniformMixture.java @@ -33,6 +33,7 @@ import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore; import de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs; import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; import de.lmu.ifi.dbs.elki.database.ids.generic.MaskedDBIDs; @@ -41,6 +42,7 @@ import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.math.DoubleMinMax; import de.lmu.ifi.dbs.elki.math.MathUtil; +import 
de.lmu.ifi.dbs.elki.math.linearalgebra.CovarianceMatrix; import de.lmu.ifi.dbs.elki.math.linearalgebra.Matrix; import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; import de.lmu.ifi.dbs.elki.result.outlier.BasicOutlierScoreMeta; @@ -127,7 +129,13 @@ public class GaussianUniformMixture> extends Abstra this.c = c; } - public OutlierResult run(Relation relation) throws IllegalStateException { + /** + * Run the algorithm + * + * @param relation Data relation + * @return Outlier result + */ + public OutlierResult run(Relation relation) { // Use an array list of object IDs for fast random access by an offset ArrayDBIDs objids = DBIDUtil.ensureArray(relation.getDBIDs()); // A bit set to flag objects as anomalous, none at the beginning @@ -205,9 +213,9 @@ public class GaussianUniformMixture> extends Abstra if(objids.isEmpty()) { return 0; } - double prob = 0; - Vector mean = DatabaseUtil.centroid(database, objids).getColumnVector(); - Matrix covarianceMatrix = DatabaseUtil.covarianceMatrix(database, objids); + CovarianceMatrix builder = CovarianceMatrix.make(database, objids); + Vector mean = builder.getMeanVector(); + Matrix covarianceMatrix = builder.destroyToSampleMatrix(); // test singulaere matrix Matrix covInv = covarianceMatrix.cheatToAvoidSingularity(SINGULARITY_CHEAT).inverse(); @@ -215,8 +223,9 @@ public class GaussianUniformMixture> extends Abstra double covarianceDet = covarianceMatrix.det(); double fakt = 1.0 / Math.sqrt(Math.pow(MathUtil.TWOPI, DatabaseUtil.dimensionality(database)) * covarianceDet); // for each object compute probability and sum - for(DBID id : objids) { - Vector x = database.get(id).getColumnVector().minusEquals(mean); + double prob = 0; + for (DBIDIter iter = objids.iter(); iter.valid(); iter.advance()) { + Vector x = database.get(iter).getColumnVector().minusEquals(mean); double mDist = x.transposeTimesTimes(covInv, x); prob += Math.log(fakt * Math.exp(-mDist / 2.0)); } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/HilOut.java 
b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/HilOut.java new file mode 100644 index 00000000..4ed56e1a --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/HilOut.java @@ -0,0 +1,988 @@ +package de.lmu.ifi.dbs.elki.algorithm.outlier; +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2012 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see . 
+ */ + +import java.util.Collections; +import java.util.Comparator; +import java.util.HashSet; +import java.util.Set; + +import de.lmu.ifi.dbs.elki.algorithm.AbstractDistanceBasedAlgorithm; +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.data.type.TypeInformation; +import de.lmu.ifi.dbs.elki.data.type.TypeUtil; +import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; +import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; +import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore; +import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; +import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.ids.HashSetModifiableDBIDs; +import de.lmu.ifi.dbs.elki.database.query.DoubleDistanceResultPair; +import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; +import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.distance.distancefunction.EuclideanDistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancefunction.LPNormDistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancevalue.DoubleDistance; +import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress; +import de.lmu.ifi.dbs.elki.math.DoubleMinMax; +import de.lmu.ifi.dbs.elki.math.spacefillingcurves.HilbertSpatialSorter; +import de.lmu.ifi.dbs.elki.result.outlier.BasicOutlierScoreMeta; +import de.lmu.ifi.dbs.elki.result.outlier.OutlierResult; +import de.lmu.ifi.dbs.elki.result.outlier.OutlierScoreMeta; +import de.lmu.ifi.dbs.elki.utilities.BitsUtil; +import de.lmu.ifi.dbs.elki.utilities.DatabaseUtil; +import de.lmu.ifi.dbs.elki.utilities.datastructures.heap.Heap; +import de.lmu.ifi.dbs.elki.utilities.documentation.Description; +import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; +import 
de.lmu.ifi.dbs.elki.utilities.documentation.Title; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.EnumParameter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; +import de.lmu.ifi.dbs.elki.utilities.pairs.Pair; + +/** + * Fast Outlier Detection in High Dimensional Spaces + * + * Outlier Detection using Hilbert space filling curves + * + * Reference: + *

+ * F. Angiulli, C. Pizzuti:
+ * Fast Outlier Detection in High Dimensional Spaces.
+ * In: Proc. European Conference on Principles of Knowledge Discovery and Data + * Mining (PKDD'02), Helsinki, Finland, 2002. + *

+ * + * @author Jonathan von Brünken + * @author Erich Schubert + * + * @apiviz.composedOf HilbertFeatures + * @apiviz.uses HilFeature + * + * @param Object type + */ +@Title("Fast Outlier Detection in High Dimensional Spaces") +@Description("Algorithm to compute outliers using Hilbert space filling curves") +@Reference(authors = "F. Angiulli, C. Pizzuti", title = "Fast Outlier Detection in High Dimensional Spaces", booktitle = "Proc. European Conference on Principles of Knowledge Discovery and Data Mining (PKDD'02)", url = "http://dx.doi.org/10.1145/375663.375668") +public class HilOut> extends AbstractDistanceBasedAlgorithm implements OutlierAlgorithm { + /** + * The logger for this class. + */ + private static final Logging logger = Logging.getLogger(HilOut.class); + + /** + * Number of nearest neighbors + */ + private int k; + + /** + * Number of outliers to compute exactly + */ + private int n; + + /** + * Hilbert precision + */ + private int h; + + /** + * LPNorm p parameter + */ + private double t; + + /** + * Reporting mode: exact (top n) only, or all + */ + private Enum tn; + + /** + * Distance query + */ + private DistanceQuery distq; + + /** + * Set sizes, total and current iteration + */ + private int capital_n, n_star, capital_n_star, d; + + /** + * Outlier threshold + */ + private double omega_star; + + /** + * Type of output: all scores (upper bounds) or top n only + * + * @author Jonathan von Brünken + * + * @apiviz.exclude + */ + public static enum ScoreType { + All, TopN + } + + /** + * Constructor. + * + * @param k Number of Next Neighbors + * @param n Number of Outlier + * @param h Number of Bits for precision to use - max 32 + * @param tn TopN or All Outlier Rank to return + */ + protected HilOut(LPNormDistanceFunction distfunc, int k, int n, int h, Enum tn) { + super(distfunc); + this.n = n; + // HilOut does not count the object itself. We do in KNNWeightOutlier. 
+ this.k = k - 1; + this.h = h; + this.tn = tn; + this.t = distfunc.getP(); + this.n_star = 0; + this.omega_star = 0.0; + } + + public OutlierResult run(Database database, Relation relation) { + distq = database.getDistanceQuery(relation, getDistanceFunction()); + d = DatabaseUtil.dimensionality(relation); + WritableDoubleDataStore hilout_weight = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC); + + // Compute extend of dataset. + double[] min; + double diameter = 0; // Actually "length of edge" + { + Pair hbbs = DatabaseUtil.computeMinMax(relation); + min = new double[d]; + double[] max = new double[d]; + for(int i = 0; i < d; i++) { + min[i] = hbbs.first.doubleValue(i + 1); + max[i] = hbbs.second.doubleValue(i + 1); + diameter = Math.max(diameter, max[i] - min[i]); + } + // Enlarge bounding box to have equal lengths. + for(int i = 0; i < d; i++) { + double diff = (diameter - (max[i] - min[i])) / 2; + min[i] -= diff; + max[i] += diff; + } + if(logger.isVerbose()) { + logger.verbose("Rescaling dataset by " + (1 / diameter) + " to fit the unit cube."); + } + } + + // Initialization part + capital_n_star = capital_n = relation.size(); + HilbertFeatures h = new HilbertFeatures(relation, min, diameter); + + FiniteProgress progressHilOut = logger.isVerbose() ? new FiniteProgress("HilOut iterations", d + 1, logger) : null; + FiniteProgress progressTrueOut = logger.isVerbose() ? new FiniteProgress("True outliers found", n, logger) : null; + // Main part: 1. Phase max. 
d+1 loops + for(int j = 0; j <= d && n_star < n; j++) { + // initialize (clear) out and wlb - not 100% clear in the paper + h.out.clear(); + h.wlb.clear(); + // Initialize Hilbert values in pf according to current shift + h.initialize(.5 * j / (d + 1)); + // scan the Data according to the current shift; build out and wlb + scan(h, (int) (k * capital_n / (double) capital_n_star)); + // determine the true outliers (n_star) + trueOutliers(h); + if(progressTrueOut != null) { + progressTrueOut.setProcessed(n_star, logger); + } + // Build the top Set as out + wlb + h.top.clear(); + HashSetModifiableDBIDs top_keys = DBIDUtil.newHashSet(h.out.size()); + for(HilFeature entry : h.out) { + top_keys.add(entry.id); + h.top.add(entry); + } + for(HilFeature entry : h.wlb) { + if(!top_keys.contains(entry.id)) { + // No need to update top_keys - discarded + h.top.add(entry); + } + } + if(progressHilOut != null) { + progressHilOut.incrementProcessed(logger); + } + } + // 2. Phase: Additional Scan if less than n true outliers determined + if(n_star < n) { + h.out.clear(); + h.wlb.clear(); + // TODO: reinitialize shift to 0? 
+ scan(h, capital_n); + } + if(progressHilOut != null) { + progressHilOut.setProcessed(d, logger); + progressHilOut.ensureCompleted(logger); + } + if(progressTrueOut != null) { + progressTrueOut.setProcessed(n, logger); + progressTrueOut.ensureCompleted(logger); + } + DoubleMinMax minmax = new DoubleMinMax(); + // Return weights in out + if(tn == ScoreType.TopN) { + minmax.put(0.0); + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + hilout_weight.putDouble(iditer, 0.0); + } + for(HilFeature ent : h.out) { + minmax.put(ent.ubound); + hilout_weight.putDouble(ent.id, ent.ubound); + } + } + // Return all weights in pf + else { + for(HilFeature ent : h.pf) { + minmax.put(ent.ubound); + hilout_weight.putDouble(ent.id, ent.ubound); + } + } + Relation scoreResult = new MaterializedRelation("HilOut weight", "hilout-weight", TypeUtil.DOUBLE, hilout_weight, relation.getDBIDs()); + OutlierScoreMeta scoreMeta = new BasicOutlierScoreMeta(minmax.getMin(), minmax.getMax(), 0.0, Double.POSITIVE_INFINITY); + OutlierResult result = new OutlierResult(scoreMeta, scoreResult); + return result; + } + + /** + * Scan function performs a squential scan over the data. 
+ * + * @param hf the hilbert features + * @param k0 + */ + private void scan(HilbertFeatures hf, int k0) { + final int mink0 = Math.min(2 * k0, capital_n - 1); + if(logger.isDebuggingFine()) { + logger.debugFine("Scanning with k0=" + k0 + " (" + mink0 + ")" + " N*=" + capital_n_star); + } + for(int i = 0; i < hf.pf.length; i++) { + if(hf.pf[i].ubound < omega_star) { + continue; + } + if(hf.pf[i].lbound < hf.pf[i].ubound) { + double omega = hf.fastUpperBound(i); + if(omega < omega_star) { + hf.pf[i].ubound = omega; + } + else { + int maxcount; + // capital_n-1 instead of capital_n: all, except self + if(hf.top.contains(hf.pf[i])) { + maxcount = capital_n - 1; + } + else { + maxcount = mink0; + } + innerScan(hf, i, maxcount); + } + } + if(hf.pf[i].ubound > 0) { + hf.updateOUT(i); + } + if(hf.pf[i].lbound > 0) { + hf.updateWLB(i); + } + if(hf.wlb.size() >= n) { + omega_star = Math.max(omega_star, hf.wlb.peek().lbound); + } + } + } + + /** + * innerScan function calculates new upper and lower bounds and inserts the + * points of the neighborhood the bounds are based on in the NN Set + * + * @param i position in pf of the feature for which the bounds should be + * calculated + * @param maxcount maximal size of the neighborhood + */ + private void innerScan(HilbertFeatures hf, final int i, final int maxcount) { + final O p = hf.relation.get(hf.pf[i].id); // Get only once for performance + int a = i, b = i; + int level = h, levela = h, levelb = h; + // Explore up to "maxcount" neighbors in this pass + for(int count = 0; count < maxcount; count++) { + final int c; // Neighbor to explore + if(a == 0) { // At left end, explore right + // assert (b < capital_n - 1); + levelb = Math.min(levelb, hf.pf[b].level); + b++; + c = b; + } + else if(b >= capital_n - 1) { // At right end, explore left + // assert (a > 0); + a--; + levela = Math.min(levela, hf.pf[a].level); + c = a; + } + else if(hf.pf[a - 1].level >= hf.pf[b].level) { // Prefer higher level + a--; + levela = 
Math.min(levela, hf.pf[a].level); + c = a; + } + else { + // assert (b < capital_n - 1); + levelb = Math.min(levelb, hf.pf[b].level); + b++; + c = b; + } + if(!hf.pf[i].nn_keys.contains(hf.pf[c].id)) { + // hf.distcomp ++; + hf.pf[i].insert(hf.pf[c].id, distq.distance(p, hf.pf[c].id).doubleValue(), k); + if(hf.pf[i].nn.size() == k) { + if(hf.pf[i].sum_nn < omega_star) { + break; // stop = true + } + final int mlevel = Math.max(levela, levelb); + if(mlevel < level) { + level = mlevel; + final double delta = hf.minDistLevel(hf.pf[i].id, level); + if(delta >= hf.pf[i].nn.peek().getDoubleDistance()) { + break; // stop = true + } + } + } + } + } + double br = hf.boxRadius(i, a - 1, b + 1); + double newlb = 0.0; + double newub = 0.0; + for(DoubleDistanceResultPair entry : hf.pf[i].nn) { + newub += entry.getDoubleDistance(); + if(entry.getDoubleDistance() <= br) { + newlb += entry.getDoubleDistance(); + } + } + if(newlb > hf.pf[i].lbound) { + hf.pf[i].lbound = newlb; + } + if(newub < hf.pf[i].ubound) { + hf.pf[i].ubound = newub; + } + } + + /** + * trueOutliers function updates n_star + * + * @param h the HilberFeatures + * + */ + + private void trueOutliers(HilbertFeatures h) { + n_star = 0; + for(HilFeature entry : h.out) { + if(entry.ubound >= omega_star && (entry.ubound - entry.lbound < 1E-10)) { + n_star++; + } + } + } + + @Override + protected Logging getLogger() { + return logger; + } + + @Override + public TypeInformation[] getInputTypeRestriction() { + return TypeUtil.array(new LPNormDistanceFunction(t).getInputTypeRestriction()); + } + + /** + * Class organizing the data points along a hilbert curve. 
+ * + * @author Jonathan von Brünken + * + * @apiviz.composedOf HilFeature + */ + class HilbertFeatures { + // public int distcomp = 1; + + /** + * Relation indexed + */ + Relation relation; + + /** + * Hilbert representation ("point features") + */ + HilFeature[] pf; + + /** + * Data space minimums + */ + double[] min; + + /** + * Data space diameter + */ + double diameter; + + /** + * Current curve shift + */ + double shift; + + /** + * Top candidates + */ + private Set top; + + /** + * "OUT" + */ + private Heap out; + + /** + * "WLB" + */ + private Heap wlb; + + /** + * Constructor. + * + * @param relation Relation to index + * @param min Minimums for data space + * @param diameter Diameter of data space + */ + public HilbertFeatures(Relation relation, double[] min, double diameter) { + super(); + this.relation = relation; + this.min = min; + this.diameter = diameter; + this.pf = new HilFeature[relation.size()]; + + int pos = 0; + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + pf[pos++] = new HilFeature(iditer.getDBID(), new Heap(k, Collections.reverseOrder())); + } + this.out = new Heap(n, new Comparator() { + @Override + public int compare(HilFeature o1, HilFeature o2) { + return Double.compare(o1.ubound, o2.ubound); + } + }); + this.wlb = new Heap(n, new Comparator() { + @Override + public int compare(HilFeature o1, HilFeature o2) { + return Double.compare(o1.lbound, o2.lbound); + } + }); + this.top = new HashSet(2 * n); + } + + /** + * Hilbert function to fill pf with shifted Hilbert values. Also calculates + * the number current Outlier candidates capital_n_star + * + * @param shift the new shift factor + */ + private void initialize(double shift) { + this.shift = shift; + // FIXME: 64 bit mode untested - sign bit is tricky to handle correctly + // with the rescaling. 63 bit should be fine. 
The sign bit probably needs + // to be handled differently, or at least needs careful testing of the API + if(h >= 32) { // 32 to 63 bit + final long scale = Long.MAX_VALUE; // = 63 bits + for(int i = 0; i < pf.length; i++) { + NumberVector obj = relation.get(pf[i].id); + long[] coord = new long[d]; + for(int dim = 0; dim < d; dim++) { + coord[dim] = (long) (getDimForObject(obj, dim) * .5 * scale); + } + pf[i].hilbert = HilbertSpatialSorter.coordinatesToHilbert(coord, h, 1); + } + } + else if(h >= 16) { // 16-31 bit + final int scale = ~1 >>> 1; + for(int i = 0; i < pf.length; i++) { + NumberVector obj = relation.get(pf[i].id); + int[] coord = new int[d]; + for(int dim = 0; dim < d; dim++) { + coord[dim] = (int) (getDimForObject(obj, dim) * .5 * scale); + } + pf[i].hilbert = HilbertSpatialSorter.coordinatesToHilbert(coord, h, 1); + } + } + else if(h >= 8) { // 8-15 bit + final int scale = ~1 >>> 16; + for(int i = 0; i < pf.length; i++) { + NumberVector obj = relation.get(pf[i].id); + short[] coord = new short[d]; + for(int dim = 0; dim < d; dim++) { + coord[dim] = (short) (getDimForObject(obj, dim) * .5 * scale); + } + pf[i].hilbert = HilbertSpatialSorter.coordinatesToHilbert(coord, h, 16); + } + } + else { // 1-7 bit + final int scale = ~1 >>> 8; + for(int i = 0; i < pf.length; i++) { + NumberVector obj = relation.get(pf[i].id); + byte[] coord = new byte[d]; + for(int dim = 0; dim < d; dim++) { + coord[dim] = (byte) (getDimForObject(obj, dim) * .5 * scale); + } + pf[i].hilbert = HilbertSpatialSorter.coordinatesToHilbert(coord, h, 24); + } + } + java.util.Arrays.sort(pf); + // Update levels + for(int i = 0; i < pf.length - 1; i++) { + pf[i].level = minRegLevel(i, i + 1); + } + // Count candidates + capital_n_star = 0; + for(int i = 0; i < pf.length; i++) { + if(pf[i].ubound >= omega_star) { + capital_n_star++; + } + } + } + + /** + * updateOUT function inserts pf[i] in out. 
+ * + * @param i position in pf of the feature to be inserted + */ + private void updateOUT(int i) { + if(out.size() < n) { + out.offer(pf[i]); + } + else { + HilFeature head = out.peek(); + if(pf[i].ubound > head.ubound) { + // replace smallest + out.poll(); + // assert (out.peek().ubound >= head.ubound); + out.offer(pf[i]); + } + } + } + + /** + * updateWLB function inserts pf[i] in wlb. + * + * @param i position in pf of the feature to be inserted + */ + private void updateWLB(int i) { + if(wlb.size() < n) { + wlb.offer(pf[i]); + } + else { + HilFeature head = wlb.peek(); + if(pf[i].lbound > head.lbound) { + // replace smallest + wlb.poll(); + // assert (wlb.peek().lbound >= head.lbound); + wlb.offer(pf[i]); + } + } + } + + /** + * fastUpperBound function calculates an upper Bound as k*maxDist(pf[i], + * smallest neighborhood) + * + * @param i position in pf of the feature for which the bound should be + * calculated + */ + private double fastUpperBound(int i) { + int pre = i; + int post = i; + while(post - pre < k) { + int pre_level = (pre - 1 >= 0) ? pf[pre - 1].level : -2; + int post_level = (post < capital_n - 1) ? 
pf[post].level : -2; + if(post_level >= pre_level) { + post++; + } + else { + pre--; + } + } + return k * maxDistLevel(pf[i].id, minRegLevel(pre, post)); + } + + /** + * minDist function calculate the minimal Distance from Vector p to the + * border of the corresponding r-region at the given level + * + * @param id Object ID + * @param level Level of the corresponding r-region + */ + private double minDistLevel(DBID id, int level) { + final NumberVector obj = relation.get(id); + // level 1 is supposed to have r=1 as in the original publication + // 2 ^ - (level - 1) + final double r = 1.0 / (1 << (level - 1)); + double dist = Double.POSITIVE_INFINITY; + for(int dim = 0; dim < d; dim++) { + final double p_m_r = getDimForObject(obj, dim) % r; + dist = Math.min(dist, Math.min(p_m_r, r - p_m_r)); + } + return dist * diameter; + } + + /** + * maxDist function calculate the maximal Distance from Vector p to the + * border of the corresponding r-region at the given level + * + * @param id Object ID + * @param level Level of the corresponding r-region + */ + private double maxDistLevel(DBID id, int level) { + final NumberVector obj = relation.get(id); + // level 1 is supposed to have r=1 as in the original publication + final double r = 1.0 / (1 << (level - 1)); + double dist; + if(t == 1.0) { + dist = 0.0; + for(int dim = 0; dim < d; dim++) { + final double p_m_r = getDimForObject(obj, dim) % r; + // assert (p_m_r >= 0); + dist += Math.max(p_m_r, r - p_m_r); + } + } + else if(t == 2.0) { + dist = 0.0; + for(int dim = 0; dim < d; dim++) { + final double p_m_r = getDimForObject(obj, dim) % r; + // assert (p_m_r >= 0); + double a = Math.max(p_m_r, r - p_m_r); + dist += a * a; + } + dist = Math.sqrt(dist); + } + else if(!Double.isInfinite(t)) { + dist = 0.0; + for(int dim = 0; dim < d; dim++) { + final double p_m_r = getDimForObject(obj, dim) % r; + dist += Math.pow(Math.max(p_m_r, r - p_m_r), t); + } + dist = Math.pow(dist, 1.0 / t); + } + else { + dist = 
Double.NEGATIVE_INFINITY; + for(int dim = 0; dim < d; dim++) { + final double p_m_r = getDimForObject(obj, dim) % r; + dist = Math.max(dist, Math.max(p_m_r, r - p_m_r)); + } + } + return dist * diameter; + } + + /** + * Number of levels shared + * + * @param a First bitset + * @param b Second bitset + * @return Number of level shared + */ + private int numberSharedLevels(long[] a, long[] b) { + for(int i = 0, j = a.length - 1; i < a.length; i++, j--) { + final long diff = a[j] ^ b[j]; + if(diff != 0) { + // expected unused = available - used + final int expected = (a.length * Long.SIZE) - (d * h); + return ((BitsUtil.numberOfLeadingZeros(diff) + i * Long.SIZE) - expected) / d; + } + } + return h - 1; + } + + /** + * minReg function calculate the minimal r-region level containing two + * points + * + * @param a index of first point in pf + * @param b index of second point in pf + * + * @return Level of the r-region + */ + private int minRegLevel(int a, int b) { + // Sanity test: first level different -> region of level 0, r=2 + // all same: level h - 1 + return numberSharedLevels(pf[a].hilbert, pf[b].hilbert); + } + + /** + * Level of the maximum region containing ref but not q + * + * @param ref Reference point + * @param q Query point + * @return Number of bits shared across all dimensions + */ + private int maxRegLevel(int ref, int q) { + // Sanity test: first level different -> region of level 1, r=1 + // all same: level h + return numberSharedLevels(pf[ref].hilbert, pf[q].hilbert) + 1; + } + + /** + * boxRadius function calculate the Boxradius + * + * @param i index of first point + * @param a index of second point + * @param b index of third point + * + * @return box radius + */ + private double boxRadius(int i, int a, int b) { + // level are inversely ordered to box sizes. 
min -> max + final int level; + if(a < 0) { + if(b >= pf.length) { + return Double.POSITIVE_INFINITY; + } + level = maxRegLevel(i, b); + } + else if(b >= pf.length) { + level = maxRegLevel(i, a); + } + else { + level = Math.max(maxRegLevel(i, a), maxRegLevel(i, b)); + } + return minDistLevel(pf[i].id, level); + } + + /** + * Get the (projected) position of the object in dimension dim. + * + * @param obj Object + * @param dim Dimension + * @return Projected and shifted position + */ + private double getDimForObject(NumberVector obj, int dim) { + return (obj.doubleValue(dim + 1) - min[dim]) / diameter + shift; + } + } + + /** + * Hilbert representation of a single object. + * + * Details of this representation are discussed in the main HilOut + * publication, see "point features". + * + * @author Jonathan von Brünken + */ + final static class HilFeature implements Comparable { + /** + * Object ID + */ + public DBID id; + + /** + * Hilbert representation + * + * TODO: use byte[] to save some memory, but slower? + */ + public long[] hilbert = null; + + /** + * Object level + */ + public int level = 0; + + /** + * Upper bound for object + */ + public double ubound = Double.POSITIVE_INFINITY; + + /** + * Lower bound of object + */ + public double lbound = 0.0; + + /** + * Heap with the nearest known neighbors + */ + public Heap nn; + + /** + * Set representation of the nearest neighbors for faster lookups + */ + public HashSetModifiableDBIDs nn_keys = DBIDUtil.newHashSet(); + + /** + * Current weight (sum of nn distances) + */ + public double sum_nn = 0.0; + + /** + * Constructor. 
+ * + * @param id Object ID + * @param nn Heap for neighbors + */ + public HilFeature(DBID id, Heap nn) { + super(); + this.id = id; + this.nn = nn; + } + + @Override + public int compareTo(HilFeature o) { + return BitsUtil.compare(this.hilbert, o.hilbert); + } + + /** + * insert function inserts a nearest neighbor into a features nn list and + * its distance + * + * @param id DBID of the nearest neighbor + * @param dt distance or the neighbor to the features position + * @param k K + */ + protected void insert(DBID id, double dt, int k) { + // assert (!nn_keys.contains(id)); + if(nn.size() < k) { + DoubleDistanceResultPair entry = new DoubleDistanceResultPair(dt, id); + nn.offer(entry); + nn_keys.add(id); + sum_nn += dt; + } + else { + DoubleDistanceResultPair head = nn.peek(); + if(dt < head.getDoubleDistance()) { + head = nn.poll(); // Remove worst + sum_nn -= head.getDoubleDistance(); + nn_keys.remove(head.getDBID()); + + // assert (nn.peek().getDoubleDistance() <= head.getDoubleDistance()); + + DoubleDistanceResultPair entry = new DoubleDistanceResultPair(dt, id); + nn.offer(entry); + nn_keys.add(id); + sum_nn += dt; + } + } + + } + } + + /** + * Parameterization class + * + * @author Jonathan von Brünken + * + * @apiviz.exclude + * + * @param Vector type + */ + public static class Parameterizer> extends AbstractParameterizer { + /** + * Parameter to specify how many next neighbors should be used in the + * computation + */ + public static final OptionID K_ID = OptionID.getOrCreateOptionID("HilOut.k", "Compute up to k next neighbors"); + + /** + * Parameter to specify how many outliers should be computed + */ + public static final OptionID N_ID = OptionID.getOrCreateOptionID("HilOut.n", "Compute n outliers"); + + /** + * Parameter to specify the maximum Hilbert-Level + */ + public static final OptionID H_ID = OptionID.getOrCreateOptionID("HilOut.h", "Max. 
Hilbert-Level"); + + /** + * Parameter to specify p of LP-NormDistance + */ + public static final OptionID T_ID = OptionID.getOrCreateOptionID("HilOut.t", "t of Lt Metric"); + + /** + * Parameter to specify if only the Top n, or also approximations for the + * other elements, should be returned + */ + public static final OptionID TN_ID = OptionID.getOrCreateOptionID("HilOut.tn", "output of Top n or all elements"); + + /** + * Neighborhood size + */ + protected int k = 5; + + /** + * Top-n candidates to compute exactly + */ + protected int n = 10; + + /** + * Hilbert curve precision + */ + protected int h = 32; + + /** + * LPNorm distance function + */ + protected LPNormDistanceFunction distfunc; + + /** + * Scores to report: all or top-n only + */ + protected Enum tn; + + @Override + protected void makeOptions(Parameterization config) { + super.makeOptions(config); + + final IntParameter kP = new IntParameter(K_ID, 5); + if(config.grab(kP)) { + k = kP.getValue(); + } + + final IntParameter nP = new IntParameter(N_ID, 10); + if(config.grab(nP)) { + n = nP.getValue(); + } + + final IntParameter hP = new IntParameter(H_ID, 32); + if(config.grab(hP)) { + h = hP.getValue(); + } + + ObjectParameter distP = AbstractDistanceBasedAlgorithm.makeParameterDistanceFunction(EuclideanDistanceFunction.class, LPNormDistanceFunction.class); + if (config.grab(distP)) { + distfunc = distP.instantiateClass(config); + } + + final EnumParameter tnP = new EnumParameter(TN_ID, ScoreType.class, ScoreType.TopN); + if(config.grab(tnP)) { + tn = tnP.getValue(); + } + } + + @Override + protected HilOut makeInstance() { + return new HilOut(distfunc, k, n, h, tn); + } + } +} \ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/INFLO.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/INFLO.java index 083a72a6..1fe5fe71 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/INFLO.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/INFLO.java @@ -30,7 +30,7 @@ import 
de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; import de.lmu.ifi.dbs.elki.database.datastore.WritableDataStore; import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore; -import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.query.DatabaseQuery; @@ -43,6 +43,7 @@ import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction; import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance; import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.math.DoubleMinMax; +import de.lmu.ifi.dbs.elki.math.Mean; import de.lmu.ifi.dbs.elki.result.outlier.OutlierResult; import de.lmu.ifi.dbs.elki.result.outlier.OutlierScoreMeta; import de.lmu.ifi.dbs.elki.result.outlier.QuotientOutlierScoreMeta; @@ -120,9 +121,14 @@ public class INFLO> extends AbstractDistanceBa this.k = k; } - @Override - public OutlierResult run(Database database) throws IllegalStateException { - Relation relation = database.getRelation(getInputTypeRestriction()[0]); + /** + * Run the algorithm + * + * @param database Database to process + * @param relation Relation to process + * @return Outlier result + */ + public OutlierResult run(Database database, Relation relation) { DistanceQuery distFunc = database.getDistanceQuery(relation, getDistanceFunction()); ModifiableDBIDs processedIDs = DBIDUtil.newHashSet(relation.size()); @@ -134,15 +140,15 @@ public class INFLO> extends AbstractDistanceBa // density WritableDoubleDataStore density = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT); // init knns and rnns - for(DBID id : relation.iterDBIDs()) { - knns.put(id, DBIDUtil.newArray()); - rnns.put(id, DBIDUtil.newArray()); + for(DBIDIter iditer = 
relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + knns.put(iditer, DBIDUtil.newArray()); + rnns.put(iditer, DBIDUtil.newArray()); } // TODO: use kNN preprocessor? KNNQuery knnQuery = database.getKNNQuery(distFunc, k, DatabaseQuery.HINT_HEAVY_USE); - for(DBID id : relation.iterDBIDs()) { + for(DBIDIter id = relation.iterDBIDs(); id.valid(); id.advance()) { // if not visited count=0 int count = rnns.get(id).size(); ModifiableDBIDs s; @@ -158,7 +164,7 @@ public class INFLO> extends AbstractDistanceBa else { s = knns.get(id); } - for(DBID q : s) { + for (DBIDIter q = s.iter(); q.valid(); q.advance()) { if(!processedIDs.contains(q)) { // TODO: use exactly k neighbors? KNNResult listQ = knnQuery.getKNNForDBID(q, k); @@ -182,20 +188,18 @@ public class INFLO> extends AbstractDistanceBa // IF Object is pruned INFLO=1.0 DoubleMinMax inflominmax = new DoubleMinMax(); WritableDoubleDataStore inflos = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC); - for(DBID id : relation.iterDBIDs()) { + for(DBIDIter id = relation.iterDBIDs(); id.valid(); id.advance()) { if(!pruned.contains(id)) { ModifiableDBIDs knn = knns.get(id); ModifiableDBIDs rnn = rnns.get(id); double denP = density.doubleValue(id); knn.addDBIDs(rnn); - double den = 0; - for(DBID q : knn) { - double denQ = density.doubleValue(q); - den = den + denQ; + Mean mean = new Mean(); + for (DBIDIter iter = knn.iter(); iter.valid(); iter.advance()) { + mean.put(density.doubleValue(iter)); } - den = den / rnn.size(); - den = den / denP; + double den = mean.getMean() / denP; inflos.putDouble(id, den); // update minimum and maximum inflominmax.put(den); diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/KNNOutlier.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/KNNOutlier.java index ee748f99..08be944a 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/KNNOutlier.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/KNNOutlier.java @@ -29,7 +29,7 @@ import 
de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore; -import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; import de.lmu.ifi.dbs.elki.database.query.knn.KNNQuery; import de.lmu.ifi.dbs.elki.database.query.knn.KNNResult; @@ -115,11 +115,11 @@ public class KNNOutlier> extends AbstractDista DoubleMinMax minmax = new DoubleMinMax(); WritableDoubleDataStore knno_score = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC); // compute distance to the k nearest neighbor. - for(DBID id : relation.iterDBIDs()) { + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { // distance to the kth nearest neighbor - final KNNResult knns = knnQuery.getKNNForDBID(id, k); + final KNNResult knns = knnQuery.getKNNForDBID(iditer, k); double dkn = knns.getKNNDistance().doubleValue(); - knno_score.putDouble(id, dkn); + knno_score.putDouble(iditer, dkn); minmax.put(dkn); diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/KNNWeightOutlier.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/KNNWeightOutlier.java index e9657e12..cb3ca2f1 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/KNNWeightOutlier.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/KNNWeightOutlier.java @@ -30,7 +30,7 @@ import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore; -import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.query.DistanceResultPair; import 
de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; import de.lmu.ifi.dbs.elki.database.query.knn.KNNQuery; @@ -119,15 +119,15 @@ public class KNNWeightOutlier> extends Abstrac // compute distance to the k nearest neighbor. n objects with the highest // distance are flagged as outliers WritableDoubleDataStore knnw_score = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC); - for(DBID id : relation.iterDBIDs()) { + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { // compute sum of the distances to the k nearest neighbors - final KNNResult knn = knnQuery.getKNNForDBID(id, k); + final KNNResult knn = knnQuery.getKNNForDBID(iditer, k); double skn = 0; for(DistanceResultPair r : knn) { skn += r.getDistance().doubleValue(); } - knnw_score.putDouble(id, skn); + knnw_score.putDouble(iditer, skn); minmax.put(skn); if(progressKNNWeight != null) { diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/LDOF.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/LDOF.java index d9256428..84f5dcc6 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/LDOF.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/LDOF.java @@ -30,7 +30,7 @@ import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore; -import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.query.DistanceResultPair; import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; import de.lmu.ifi.dbs.elki.database.query.knn.KNNQuery; @@ -42,6 +42,7 @@ import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance; import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress; import de.lmu.ifi.dbs.elki.math.DoubleMinMax; +import 
de.lmu.ifi.dbs.elki.math.Mean; import de.lmu.ifi.dbs.elki.result.outlier.OutlierResult; import de.lmu.ifi.dbs.elki.result.outlier.OutlierScoreMeta; import de.lmu.ifi.dbs.elki.result.outlier.QuotientOutlierScoreMeta; @@ -110,7 +111,14 @@ public class LDOF> extends AbstractDistanceBas this.k = k; } - public OutlierResult run(Database database, Relation relation) throws IllegalStateException { + /** + * Run the algorithm + * + * @param database Database to process + * @param relation Relation to process + * @return Outlier result + */ + public OutlierResult run(Database database, Relation relation) { DistanceQuery distFunc = database.getDistanceQuery(relation, getDistanceFunction()); KNNQuery knnQuery = database.getKNNQuery(distFunc, k); @@ -125,29 +133,26 @@ public class LDOF> extends AbstractDistanceBas } FiniteProgress progressLDOFs = logger.isVerbose() ? new FiniteProgress("LDOF_SCORE for objects", relation.size(), logger) : null; - for(DBID id : relation.iterDBIDs()) { - KNNResult neighbors = knnQuery.getKNNForDBID(id, k); - int nsize = neighbors.size() - 1; + Mean dxp = new Mean(), Dxp = new Mean(); + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + KNNResult neighbors = knnQuery.getKNNForDBID(iditer, k); // skip the point itself - double dxp = 0; - double Dxp = 0; + dxp.reset(); Dxp.reset(); for(DistanceResultPair neighbor1 : neighbors) { - if(!neighbor1.getDBID().equals(id)) { - dxp += neighbor1.getDistance().doubleValue(); + if(!neighbor1.sameDBID(iditer)) { + dxp.put(neighbor1.getDistance().doubleValue()); for(DistanceResultPair neighbor2 : neighbors) { - if(!neighbor1.getDBID().equals(neighbor2.getDBID()) && !neighbor2.getDBID().equals(id)) { - Dxp += distFunc.distance(neighbor1.getDBID(), neighbor2.getDBID()).doubleValue(); + if(!neighbor1.sameDBID(neighbor2) && !neighbor2.sameDBID(iditer)) { + Dxp.put(distFunc.distance(neighbor1, neighbor2).doubleValue()); } } } } - dxp /= nsize; - Dxp /= (nsize * (nsize - 1)); - Double 
ldof = dxp / Dxp; - if(ldof.isNaN() || ldof.isInfinite()) { + double ldof = dxp.getMean() / Dxp.getMean(); + if(Double.isNaN(ldof) || Double.isInfinite(ldof)) { ldof = 1.0; } - ldofs.putDouble(id, ldof); + ldofs.putDouble(iditer, ldof); // update maximum ldofminmax.put(ldof); diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/LOCI.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/LOCI.java index cfd8623c..a04aa041 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/LOCI.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/LOCI.java @@ -35,7 +35,8 @@ import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; import de.lmu.ifi.dbs.elki.database.datastore.WritableDataStore; import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore; -import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; +import de.lmu.ifi.dbs.elki.database.query.DistanceDBIDResult; import de.lmu.ifi.dbs.elki.database.query.DistanceResultPair; import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; import de.lmu.ifi.dbs.elki.database.query.range.RangeQuery; @@ -136,19 +137,21 @@ public class LOCI> extends AbstractDistanceBas } /** - * Runs the algorithm in the timed evaluation part. + * Run the algorithm + * + * @param database Database to process + * @param relation Relation to process + * @return Outlier result */ - @Override - public OutlierResult run(Database database) throws IllegalStateException { - Relation relation = database.getRelation(getInputTypeRestriction()[0]); + public OutlierResult run(Database database, Relation relation) { DistanceQuery distFunc = database.getDistanceQuery(relation, getDistanceFunction()); RangeQuery rangeQuery = database.getRangeQuery(distFunc); FiniteProgress progressPreproc = logger.isVerbose() ? 
new FiniteProgress("LOCI preprocessing", relation.size(), logger) : null; // LOCI preprocessing step WritableDataStore> interestingDistances = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_SORTED, ArrayList.class); - for(DBID id : relation.iterDBIDs()) { - List> neighbors = rangeQuery.getRangeForDBID(id, rmax); + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + DistanceDBIDResult neighbors = rangeQuery.getRangeForDBID(iditer, rmax); // build list of critical distances ArrayList cdist = new ArrayList(neighbors.size() * 2); { @@ -177,7 +180,7 @@ public class LOCI> extends AbstractDistanceBas } } - interestingDistances.put(id, cdist); + interestingDistances.put(iditer, cdist); if(progressPreproc != null) { progressPreproc.incrementProcessed(logger); } @@ -191,8 +194,8 @@ public class LOCI> extends AbstractDistanceBas WritableDoubleDataStore mdef_radius = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC); DoubleMinMax minmax = new DoubleMinMax(); - for(DBID id : relation.iterDBIDs()) { - final List cdist = interestingDistances.get(id); + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + final List cdist = interestingDistances.get(iditer); final double maxdist = cdist.get(cdist.size() - 1).first; final int maxneig = cdist.get(cdist.size() - 1).second; @@ -201,7 +204,7 @@ public class LOCI> extends AbstractDistanceBas if(maxneig >= nmin) { D range = distFunc.getDistanceFactory().fromDouble(maxdist); // Compute the largest neighborhood we will need. - List> maxneighbors = rangeQuery.getRangeForDBID(id, range); + List> maxneighbors = rangeQuery.getRangeForDBID(iditer, range); // Ensure the set is sorted. Should be a no-op with most indexes. Collections.sort(maxneighbors); // For any critical distance, compute the normalized MDEF score. 
@@ -221,7 +224,7 @@ public class LOCI> extends AbstractDistanceBas if(ne.getDistance().doubleValue() > r) { break; } - int rn_alphar = elementsAtRadius(interestingDistances.get(ne.getDBID()), alpha_r); + int rn_alphar = elementsAtRadius(interestingDistances.get(ne), alpha_r); mv_n_r_alpha.put(rn_alphar); } // We only use the average and standard deviation @@ -244,8 +247,8 @@ public class LOCI> extends AbstractDistanceBas maxmdefnorm = 1.0; maxnormr = maxdist; } - mdef_norm.putDouble(id, maxmdefnorm); - mdef_radius.putDouble(id, maxnormr); + mdef_norm.putDouble(iditer, maxmdefnorm); + mdef_radius.putDouble(iditer, maxnormr); minmax.put(maxmdefnorm); if(progressLOCI != null) { progressLOCI.incrementProcessed(logger); @@ -255,7 +258,7 @@ public class LOCI> extends AbstractDistanceBas progressLOCI.ensureCompleted(logger); } Relation scoreResult = new MaterializedRelation("LOCI normalized MDEF", "loci-mdef-outlier", TypeUtil.DOUBLE, mdef_norm, relation.getDBIDs()); - OutlierScoreMeta scoreMeta = new QuotientOutlierScoreMeta(minmax.getMin(), minmax.getMax(), Double.POSITIVE_INFINITY, 0.0); + OutlierScoreMeta scoreMeta = new QuotientOutlierScoreMeta(minmax.getMin(), minmax.getMax(), 0.0, Double.POSITIVE_INFINITY, 0.0); OutlierResult result = new OutlierResult(scoreMeta, scoreResult); result.addChildResult(new MaterializedRelation("LOCI MDEF Radius", "loci-critical-radius", TypeUtil.DOUBLE, mdef_radius, relation.getDBIDs())); return result; diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/LOF.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/LOF.java index 85e1aef2..5aba41ec 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/LOF.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/LOF.java @@ -33,7 +33,7 @@ import de.lmu.ifi.dbs.elki.database.datastore.DataStore; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore; -import 
de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; import de.lmu.ifi.dbs.elki.database.query.DatabaseQuery; import de.lmu.ifi.dbs.elki.database.query.DistanceResultPair; @@ -51,6 +51,7 @@ import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress; import de.lmu.ifi.dbs.elki.logging.progress.StepProgress; import de.lmu.ifi.dbs.elki.math.DoubleMinMax; +import de.lmu.ifi.dbs.elki.math.Mean; import de.lmu.ifi.dbs.elki.result.outlier.OutlierResult; import de.lmu.ifi.dbs.elki.result.outlier.OutlierScoreMeta; import de.lmu.ifi.dbs.elki.result.outlier.QuotientOutlierScoreMeta; @@ -174,13 +175,15 @@ public class LOF> extends AbstractAlgorithm distanceFunction) { this(k, distanceFunction, distanceFunction); } - + /** * Performs the Generalized LOF_SCORE algorithm on the given database by * calling {@link #doRunInTime}. @@ -239,11 +242,14 @@ public class LOF> extends AbstractAlgorithm doRunInTime(DBIDs ids, KNNQuery kNNRefer, KNNQuery kNNReach, StepProgress stepprog) throws IllegalStateException { + protected LOFResult doRunInTime(DBIDs ids, KNNQuery kNNRefer, KNNQuery kNNReach, StepProgress stepprog) { // Assert we got something if(kNNRefer == null) { throw new AbortException("No kNN queries supported by database for reference neighborhood distance function."); @@ -290,19 +296,19 @@ public class LOF> extends AbstractAlgorithm knnReach) { WritableDoubleDataStore lrds = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP); FiniteProgress lrdsProgress = logger.isVerbose() ? new FiniteProgress("LRD", ids.size(), logger) : null; - for(DBID id : ids) { - double sum = 0; - KNNResult neighbors = knnReach.getKNNForDBID(id, k); - int nsize = neighbors.size() - (objectIsInKNN ? 
0 : 1); + Mean mean = new Mean(); + for(DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) { + mean.reset(); + KNNResult neighbors = knnReach.getKNNForDBID(iter, k); for(DistanceResultPair neighbor : neighbors) { - if(objectIsInKNN || !neighbor.getDBID().equals(id)) { - KNNResult neighborsNeighbors = knnReach.getKNNForDBID(neighbor.getDBID(), k); - sum += Math.max(neighbor.getDistance().doubleValue(), neighborsNeighbors.getKNNDistance().doubleValue()); + if(objectIsInKNN || !neighbor.sameDBID(iter)) { + KNNResult neighborsNeighbors = knnReach.getKNNForDBID(neighbor, k); + mean.put(Math.max(neighbor.getDistance().doubleValue(), neighborsNeighbors.getKNNDistance().doubleValue())); } } // Avoid division by 0 - double lrd = (sum > 0) ? nsize / sum : 0.0; - lrds.putDouble(id, lrd); + final double lrd = (mean.getCount() > 0) ? 1 / mean.getMean() : 0.0; + lrds.putDouble(iter, lrd); if(lrdsProgress != null) { lrdsProgress.incrementProcessed(logger); } @@ -328,26 +334,25 @@ public class LOF> extends AbstractAlgorithm 0) { - final KNNResult neighbors = knnRefer.getKNNForDBID(id, k); - int nsize = neighbors.size() - (objectIsInKNN ? 
0 : 1); - // skip the point itself - // neighbors.remove(0); - double sum = 0; + final KNNResult neighbors = knnRefer.getKNNForDBID(iter, k); + mean.reset(); for(DistanceResultPair neighbor : neighbors) { - if(objectIsInKNN || !neighbor.getDBID().equals(id)) { - sum += lrds.get(neighbor.getDBID()); + // skip the point itself + if(objectIsInKNN || !neighbor.sameDBID(iter)) { + mean.put(lrds.get(neighbor)); } } - lof = (sum / nsize) / lrdp; + lof = mean.getMean() / lrdp; } else { lof = 1.0; } - lofs.putDouble(id, lof); + lofs.putDouble(iter, lof); // update minimum and maximum lofminmax.put(lof); diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/LoOP.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/LoOP.java index f1c273f6..dc0d26a4 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/LoOP.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/LoOP.java @@ -32,7 +32,7 @@ import de.lmu.ifi.dbs.elki.database.QueryUtil; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore; -import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.query.DatabaseQuery; import de.lmu.ifi.dbs.elki.database.query.DistanceResultPair; import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; @@ -47,6 +47,7 @@ import de.lmu.ifi.dbs.elki.index.preprocessed.knn.MaterializeKNNPreprocessor; import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress; import de.lmu.ifi.dbs.elki.logging.progress.StepProgress; +import de.lmu.ifi.dbs.elki.math.Mean; import de.lmu.ifi.dbs.elki.math.MeanVariance; import de.lmu.ifi.dbs.elki.math.statistics.distribution.NormalDistribution; import de.lmu.ifi.dbs.elki.result.outlier.OutlierResult; @@ -206,8 +207,12 @@ public class LoOP> extends AbstractAlgorithm relation) throws IllegalStateException { + 
public OutlierResult run(Database database, Relation relation) { final double sqrt2 = Math.sqrt(2.0); StepProgress stepprog = logger.isVerbose() ? new StepProgress(5) : null; @@ -226,28 +231,29 @@ public class LoOP> extends AbstractAlgorithm neighbors = knnReach.getKNNForDBID(id, kreach); - double sqsum = 0.0; + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + final KNNResult neighbors = knnReach.getKNNForDBID(iditer, kreach); + mean.reset(); // use first kref neighbors as reference set int ks = 0; for(DistanceResultPair neighbor : neighbors) { - if(objectIsInKNN || !neighbor.getDBID().equals(id)) { + if(objectIsInKNN || !neighbor.sameDBID(iditer)) { double d = neighbor.getDistance().doubleValue(); - sqsum += d * d; + mean.put(d * d); ks++; if(ks >= kreach) { break; } } } - double pdist = lambda * Math.sqrt(sqsum / ks); - pdists.putDouble(id, pdist); + double pdist = lambda * Math.sqrt(mean.getMean()); + pdists.putDouble(iditer, pdist); if(prdsProgress != null) { prdsProgress.incrementProcessed(logger); } @@ -262,25 +268,26 @@ public class LoOP> extends AbstractAlgorithm neighbors = knnComp.getKNNForDBID(id, kcomp); - MeanVariance mv = new MeanVariance(); + MeanVariance mv = new MeanVariance(); + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + final KNNResult neighbors = knnComp.getKNNForDBID(iditer, kcomp); + mv.reset(); // use first kref neighbors as comparison set. 
int ks = 0; for(DistanceResultPair neighbor1 : neighbors) { - if(objectIsInKNN || !neighbor1.getDBID().equals(id)) { - mv.put(pdists.doubleValue(neighbor1.getDBID())); + if(objectIsInKNN || !neighbor1.sameDBID(iditer)) { + mv.put(pdists.doubleValue(neighbor1)); ks++; if(ks >= kcomp) { break; } } } - double plof = Math.max(pdists.doubleValue(id) / mv.getMean(), 1.0); + double plof = Math.max(pdists.doubleValue(iditer) / mv.getMean(), 1.0); if(Double.isNaN(plof) || Double.isInfinite(plof)) { plof = 1.0; } - plofs.putDouble(id, plof); + plofs.putDouble(iditer, plof); mvplof.put((plof - 1.0) * (plof - 1.0)); if(progressPLOFs != null) { @@ -302,8 +309,8 @@ public class LoOP> extends AbstractAlgorithm> extends AbstractDistanc // FIXME: implicit preprocessor. WritableDataStore> nMinPts = DataStoreUtil.makeStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP, KNNResult.class); WritableDoubleDataStore coreDistance = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP); - WritableDataStore minPtsNeighborhoodSize = DataStoreUtil.makeStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP, Integer.class); + WritableIntegerDataStore minPtsNeighborhoodSize = DataStoreUtil.makeIntegerStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP, -1); // Pass 1 // N_minpts(id) and core-distance(id) - for(DBID id : relation.iterDBIDs()) { - KNNResult minptsNeighbours = knnQuery.getKNNForDBID(id, minpts); + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + KNNResult minptsNeighbours = knnQuery.getKNNForDBID(iditer, minpts); D d = minptsNeighbours.getKNNDistance(); - nMinPts.put(id, minptsNeighbours); - coreDistance.putDouble(id, d.doubleValue()); - minPtsNeighborhoodSize.put(id, rangeQuery.getRangeForDBID(id, d).size()); + nMinPts.put(iditer, minptsNeighbours); + coreDistance.putDouble(iditer, d.doubleValue()); + minPtsNeighborhoodSize.put(iditer, 
rangeQuery.getRangeForDBID(iditer, d).size()); } // Pass 2 WritableDataStore> reachDistance = DataStoreUtil.makeStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP, List.class); WritableDoubleDataStore lrds = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP); - for(DBID id : relation.iterDBIDs()) { + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { List core = new ArrayList(); double lrd = 0; - for(DistanceResultPair neighPair : nMinPts.get(id)) { - DBID idN = neighPair.getDBID(); - double coreDist = coreDistance.doubleValue(idN); - double dist = distQuery.distance(id, idN).doubleValue(); - Double rd = Math.max(coreDist, dist); + for(DistanceResultPair neighPair : nMinPts.get(iditer)) { + double coreDist = coreDistance.doubleValue(neighPair); + double dist = distQuery.distance(iditer, neighPair).doubleValue(); + double rd = Math.max(coreDist, dist); lrd = rd + lrd; core.add(rd); } - lrd = (minPtsNeighborhoodSize.get(id) / lrd); - reachDistance.put(id, core); - lrds.putDouble(id, lrd); + lrd = minPtsNeighborhoodSize.intValue(iditer) / lrd; + reachDistance.put(iditer, core); + lrds.putDouble(iditer, lrd); } // Pass 3 DoubleMinMax ofminmax = new DoubleMinMax(); WritableDoubleDataStore ofs = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_STATIC); - for(DBID id : relation.iterDBIDs()) { + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { double of = 0; - for(DistanceResultPair pair : nMinPts.get(id)) { - DBID idN = pair.getDBID(); - double lrd = lrds.doubleValue(id); - double lrdN = lrds.doubleValue(idN); + for(DistanceResultPair pair : nMinPts.get(iditer)) { + double lrd = lrds.doubleValue(iditer); + double lrdN = lrds.doubleValue(pair); of = of + lrdN / lrd; } - of = of / minPtsNeighborhoodSize.get(id); - ofs.putDouble(id, of); + of = of / minPtsNeighborhoodSize.intValue(iditer); + ofs.putDouble(iditer, of); // update minimum and maximum 
ofminmax.put(of); } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/OUTRES.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/OUTRES.java deleted file mode 100644 index 912f878a..00000000 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/OUTRES.java +++ /dev/null @@ -1,368 +0,0 @@ -package de.lmu.ifi.dbs.elki.algorithm.outlier; - -/* - This file is part of ELKI: - Environment for Developing KDD-Applications Supported by Index-Structures - - Copyright (C) 2012 - Ludwig-Maximilians-Universität München - Lehr- und Forschungseinheit für Datenbanksysteme - ELKI Development Team - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see . 
- */ - -import java.util.Arrays; -import java.util.BitSet; -import java.util.List; - -import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm; -import de.lmu.ifi.dbs.elki.algorithm.outlier.OutlierAlgorithm; -import de.lmu.ifi.dbs.elki.data.NumberVector; -import de.lmu.ifi.dbs.elki.data.type.TypeInformation; -import de.lmu.ifi.dbs.elki.data.type.TypeUtil; -import de.lmu.ifi.dbs.elki.database.QueryUtil; -import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; -import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; -import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore; -import de.lmu.ifi.dbs.elki.database.ids.DBID; -import de.lmu.ifi.dbs.elki.database.query.DistanceResultPair; -import de.lmu.ifi.dbs.elki.database.query.DoubleDistanceResultPair; -import de.lmu.ifi.dbs.elki.database.query.range.RangeQuery; -import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation; -import de.lmu.ifi.dbs.elki.database.relation.Relation; -import de.lmu.ifi.dbs.elki.distance.distancefunction.subspace.SubspaceEuclideanDistanceFunction; -import de.lmu.ifi.dbs.elki.distance.distancevalue.DoubleDistance; -import de.lmu.ifi.dbs.elki.logging.Logging; -import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress; -import de.lmu.ifi.dbs.elki.math.DoubleMinMax; -import de.lmu.ifi.dbs.elki.math.MeanVariance; -import de.lmu.ifi.dbs.elki.math.statistics.EpanechnikovKernelDensityFunction; -import de.lmu.ifi.dbs.elki.math.statistics.KernelDensityFunction; -import de.lmu.ifi.dbs.elki.math.statistics.distribution.GammaDistribution; -import de.lmu.ifi.dbs.elki.result.outlier.InvertedOutlierScoreMeta; -import de.lmu.ifi.dbs.elki.result.outlier.OutlierResult; -import de.lmu.ifi.dbs.elki.result.outlier.OutlierScoreMeta; -import de.lmu.ifi.dbs.elki.utilities.DatabaseUtil; -import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; 
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter; - -/** - * Adaptive outlierness for subspace outlier ranking (OUTRES). - * - * Note: this algorithm seems to have a O(n^3) complexity without appropriate - * index structures to accelerate range queries: each object in each tested - * subspace will need to know the mean and standard deviation of the density of - * the neighbors, which in turn needs another range query. - * - * Reference: - *

- * E. Müller, M. Schiffer, T. Seidl
- * Adaptive outlierness for subspace outlier ranking
- * in: Proc. 19th ACM International Conference on Information and knowledge - * management - *

- * - * @author Pleintinger Viktoria - * @author Erich Schubert - */ -@Reference(authors = "E. Müller, M. Schiffer, T. Seidl", title = "Adaptive outlierness for subspace outlier ranking", booktitle = "Proc. 19th ACM International Conference on Information and knowledge management") -public class OUTRES> extends AbstractAlgorithm implements OutlierAlgorithm { - /** - * The logger for this class. - */ - private static final Logging logger = Logging.getLogger(OUTRES.class); - - /** - * The epsilon (in 2d) parameter - */ - private final double eps; - - /** - * Constant for Kolmogorov-Smirnov at alpha=0.01 (table value) - */ - private static final double K_S_CRITICAL001 = 1.63; - - /** - * Constructor. - * - * @param eps Epsilon - */ - public OUTRES(double eps) { - super(); - this.eps = eps; - } - - /** - * Main loop for OUTRES - * - * @param relation Relation to process - * @return Outlier detection result - */ - public OutlierResult run(Relation relation) { - WritableDoubleDataStore ranks = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC); - DoubleMinMax minmax = new DoubleMinMax(); - - KernelDensityEstimator kernel = new KernelDensityEstimator(relation); - BitSet subspace = new BitSet(kernel.dim); - - FiniteProgress progress = logger.isVerbose() ? 
new FiniteProgress("OutRank scores", relation.size(), logger) : null; - - for(DBID object : relation.iterDBIDs()) { - subspace.clear(); - double score = outresScore(0, subspace, object, kernel); - ranks.putDouble(object, score); - minmax.put(score); - if(progress != null) { - progress.incrementProcessed(logger); - } - } - if(progress != null) { - progress.ensureCompleted(logger); - } - - OutlierScoreMeta meta = new InvertedOutlierScoreMeta(minmax.getMin(), minmax.getMax(), 0., 1., 1.); - OutlierResult outresResult = new OutlierResult(meta, new MaterializedRelation("OUTRES", "outres-score", TypeUtil.DOUBLE, ranks, relation.getDBIDs())); - return outresResult; - } - - /** - * Main loop of OUTRES. Run for each object - * - * @param s start dimension - * @param subspace Current subspace - * @param id Current object ID - * @param kernel Kernel - * @return Score - */ - public double outresScore(final int s, BitSet subspace, DBID id, KernelDensityEstimator kernel) { - double score = 1.0; // Initial score is 1.0 - - for(int i = s; i < kernel.dim; i++) { - if(subspace.get(i)) { // TODO: needed? Or should we always start with i=0? - continue; - } - subspace.set(i); - final SubspaceEuclideanDistanceFunction df = new SubspaceEuclideanDistanceFunction(subspace); - final DoubleDistance range = new DoubleDistance(kernel.adjustedEps(kernel.dim)); - RangeQuery rq = QueryUtil.getRangeQuery(kernel.relation, df, range); - - List> neigh = rq.getRangeForDBID(id, range); - if(neigh.size() > 2) { - // Relevance test - if(relevantSubspace(subspace, neigh, kernel)) { - final double density = kernel.subspaceDensity(subspace, neigh); - final double deviation; - // Compute mean and standard deviation for densities of neighbors. - MeanVariance meanv = new MeanVariance(); - for(DistanceResultPair pair : neigh) { - List> n2 = rq.getRangeForDBID(pair.getDBID(), range); - meanv.put(kernel.subspaceDensity(subspace, n2)); - } - deviation = (meanv.getMean() - density) / (2. 
* meanv.getSampleStddev()); - // High deviation: - if(deviation >= 1) { - score *= (density / deviation); - } - // Recursion - score *= outresScore(i + 1, subspace, id, kernel); - } - } - subspace.clear(i); - } - return score; - } - - /** - * - * @param test: subspace that will be tested about scattering - * @return if the subspace is scattered return will be 0, else 1 - */ - protected boolean relevantSubspace(BitSet subspace, List> neigh, KernelDensityEstimator kernel) { - Relation relation = kernel.relation; - final double crit = K_S_CRITICAL001 / Math.sqrt(neigh.size()); - - for(int dim = subspace.nextSetBit(0); dim > 0; dim = subspace.nextSetBit(dim + 1)) { - // TODO: can we save this copy somehow? - double[] data = new double[neigh.size()]; - { - int count = 0; - for(DistanceResultPair object : neigh) { - V vector = relation.get(object.getDBID()); - data[count] = vector.doubleValue(dim + 1); - count++; - } - assert (count == neigh.size()); - } - Arrays.sort(data); - - final double norm = data[data.length - 1] - data[0]; - final double min = data[0]; - - // Kolmogorow-Smirnow-Test against uniform distribution: - for(int j = 1; j < data.length - 2; j++) { - double delta = (j / (data.length - 1)) - ((data[j] - min) / norm); - if(Math.abs(delta) > crit) { - return false; - } - } - } - return true; - } - - /** - * Kernel density estimation and utility class. - * - * @author Erich Schubert - */ - protected class KernelDensityEstimator { - /** - * Actual kernel in use - */ - final KernelDensityFunction kernel = EpanechnikovKernelDensityFunction.KERNEL; - - /** - * Relation to retrieve data from - */ - final Relation relation; - - /** - * Epsilon values for different subspace dimensionalities - */ - final double[] epsilons; - - /** - * Optimal bandwidth for a dimensionality of 2 - */ - final double hopttwo; - - /** - * Dimensionality of data set - */ - final int dim; - - /** - * Constructor. 
- * - * @param relation Relation to apply to - */ - public KernelDensityEstimator(Relation relation) { - super(); - this.relation = relation; - dim = DatabaseUtil.dimensionality(relation); - hopttwo = optimalBandwidth(2); - epsilons = new double[dim + 1]; - Arrays.fill(epsilons, Double.NEGATIVE_INFINITY); - epsilons[2] = OUTRES.this.eps; - } - - /** - * Compute density in the given subspace. - * - * @param subspace Subspace - * @param neighbours Neighbor distance list - * @return Density - */ - protected double subspaceDensity(BitSet subspace, List> neighbours) { - final double bandwidth = optimalBandwidth(subspace.cardinality()); - - // TODO: optimize by moving instanceof outside? - double density = 0; - for(DistanceResultPair pair : neighbours) { - if(pair instanceof DoubleDistanceResultPair) { - density += kernel.density(((DoubleDistanceResultPair) pair).getDoubleDistance() / bandwidth); - } - else { - density += kernel.density(pair.getDistance().doubleValue() / bandwidth); - } - } - - return density / relation.size(); - } - - /** - * Compute optimal kernel bandwidth - * - * @param dim Dimensionality of subspace - * @return optimal bandwidth - */ - protected double optimalBandwidth(int dim) { - // Pi in the publication is redundant and cancels out! - double hopt = 8 * Math.exp(GammaDistribution.logGamma(dim / 2.0 + 1)) * (dim + 4) * Math.pow(2, dim); - return hopt * Math.pow(relation.size(), (-1 / (dim + 4))); - } - - /** - * Rescale the query radius based on the given dimensionality. 
- * - * @param dim Dimensionality - * @return Query radius - */ - protected double adjustedEps(int dim) { - // Cached - double e = epsilons[dim]; - if(e < 0) { - e = epsilons[2] * optimalBandwidth(dim) / hopttwo; - epsilons[dim] = e; - } - return e; - } - } - - @Override - protected Logging getLogger() { - return logger; - } - - @Override - public TypeInformation[] getInputTypeRestriction() { - return TypeUtil.array(TypeUtil.NUMBER_VECTOR_FIELD); - } - - /** - * Parameterization class. - * - * @author Viktoria Pleintinger - * - * @apiviz.exclude - */ - public static class Parameterizer> extends AbstractParameterizer { - /** - * Option ID for Epsilon parameter - */ - public static final OptionID D_ID = OptionID.getOrCreateOptionID("outres.epsilon", "Range value for OUTRES in 2 dimensions."); - - /** - * Query radius - */ - protected double eps; - - @Override - protected void makeOptions(Parameterization config) { - super.makeOptions(config); - final DoubleParameter param = new DoubleParameter(D_ID); - if(config.grab(param)) { - eps = param.getValue(); - } - } - - @Override - protected OUTRES makeInstance() { - return new OUTRES(eps); - } - } -} \ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/OnlineLOF.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/OnlineLOF.java index ad17398c..9b974ad9 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/OnlineLOF.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/OnlineLOF.java @@ -29,7 +29,7 @@ import de.lmu.ifi.dbs.elki.database.QueryUtil; import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore; import de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs; import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs; -import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; @@ -67,7 +67,6 @@ 
import de.lmu.ifi.dbs.elki.utilities.pairs.Pair; * @author Elke Achtert * * @apiviz.has LOF.LOFResult oneway - - updates - * @apiviz.composedOf OnlineLOF.LOFKNNListener */ // TODO: related to publication? public class OnlineLOF> extends LOF { @@ -170,6 +169,10 @@ public class OnlineLOF> extends LOF { /** * Encapsulates a listener for changes of kNNs used in the online LOF * algorithm. + * + * @author Elke Achtert + * + * @apiviz.exclude */ private class LOFKNNListener implements KNNListener { /** @@ -269,12 +272,12 @@ public class OnlineLOF> extends LOF { ArrayDBIDs affected_lrd_id_candidates = mergeIDs(reachDistRKNNs, lrd_ids); ArrayModifiableDBIDs affected_lrd_ids = DBIDUtil.newArray(affected_lrd_id_candidates.size()); WritableDoubleDataStore new_lrds = computeLRDs(affected_lrd_id_candidates, lofResult.getKNNReach()); - for(DBID id : affected_lrd_id_candidates) { - double new_lrd = new_lrds.doubleValue(id); - double old_lrd = lofResult.getLrds().doubleValue(id); + for (DBIDIter iter = affected_lrd_id_candidates.iter(); iter.valid(); iter.advance()) { + double new_lrd = new_lrds.doubleValue(iter); + double old_lrd = lofResult.getLrds().doubleValue(iter); if(Double.isNaN(old_lrd) || old_lrd != new_lrd) { - lofResult.getLrds().putDouble(id, new_lrd); - affected_lrd_ids.add(id); + lofResult.getLrds().putDouble(iter, new_lrd); + affected_lrd_ids.add(iter); } } @@ -314,9 +317,9 @@ public class OnlineLOF> extends LOF { if(stepprog != null) { stepprog.beginStep(1, "Delete old LRDs and LOFs.", logger); } - for(DBID id : deletions) { - lofResult.getLrds().delete(id); - lofResult.getLofs().delete(id); + for (DBIDIter iter = deletions.iter(); iter.valid(); iter.advance()) { + lofResult.getLrds().delete(iter); + lofResult.getLofs().delete(iter); } // recompute lrds @@ -328,12 +331,12 @@ public class OnlineLOF> extends LOF { ArrayDBIDs affected_lrd_id_candidates = mergeIDs(reachDistRKNNs, lrd_ids); ArrayModifiableDBIDs affected_lrd_ids = 
DBIDUtil.newArray(affected_lrd_id_candidates.size()); WritableDoubleDataStore new_lrds = computeLRDs(affected_lrd_id_candidates, lofResult.getKNNReach()); - for(DBID id : affected_lrd_id_candidates) { - double new_lrd = new_lrds.doubleValue(id); - double old_lrd = lofResult.getLrds().doubleValue(id); + for (DBIDIter iter = affected_lrd_id_candidates.iter(); iter.valid(); iter.advance()) { + double new_lrd = new_lrds.doubleValue(iter); + double old_lrd = lofResult.getLrds().doubleValue(iter); if(old_lrd != new_lrd) { - lofResult.getLrds().putDouble(id, new_lrd); - affected_lrd_ids.add(id); + lofResult.getLrds().putDouble(iter, new_lrd); + affected_lrd_ids.add(iter); } } @@ -371,7 +374,7 @@ public class OnlineLOF> extends LOF { } for(List> queryResult : queryResults) { for(DistanceResultPair qr : queryResult) { - result.add(qr.getDBID()); + result.add(qr); } } return DBIDUtil.newArray(result); @@ -386,8 +389,8 @@ public class OnlineLOF> extends LOF { private void recomputeLOFs(DBIDs ids, LOFResult lofResult) { Pair lofsAndMax = computeLOFs(ids, lofResult.getLrds(), lofResult.getKNNRefer()); WritableDoubleDataStore new_lofs = lofsAndMax.getFirst(); - for(DBID id : ids) { - lofResult.getLofs().putDouble(id, new_lofs.doubleValue(id)); + for (DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) { + lofResult.getLofs().putDouble(iter, new_lofs.doubleValue(iter)); } // track the maximum value for normalization. 
DoubleMinMax new_lofminmax = lofsAndMax.getSecond(); diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/OutlierAlgorithm.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/OutlierAlgorithm.java index 2b122183..d8322d8b 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/OutlierAlgorithm.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/OutlierAlgorithm.java @@ -38,5 +38,5 @@ public interface OutlierAlgorithm extends Algorithm { // Note: usually you won't override this method directly, but instead // Use the magic in AbstractAlgorithm and just implement a run method for your input data @Override - OutlierResult run(Database database) throws IllegalStateException; + OutlierResult run(Database database); } \ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/ReferenceBasedOutlierDetection.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/ReferenceBasedOutlierDetection.java index befd03ed..dd1d37a3 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/ReferenceBasedOutlierDetection.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/ReferenceBasedOutlierDetection.java @@ -37,7 +37,7 @@ import de.lmu.ifi.dbs.elki.data.type.TypeUtil; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore; -import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; import de.lmu.ifi.dbs.elki.database.query.DistanceResultPair; import de.lmu.ifi.dbs.elki.database.query.GenericDistanceResultPair; @@ -87,7 +87,7 @@ import de.lmu.ifi.dbs.elki.utilities.referencepoints.ReferencePointsHeuristic; */ @Title("An Efficient Reference-based Approach to Outlier Detection in Large Datasets") @Description("Computes kNN distances approximately, using reference points with various reference point strategies.") -@Reference(authors = "Y. Pei, O.R. 
Zaiane, Y. Gao", title = "An Efficient Reference-based Approach to Outlier Detection in Large Datasets", booktitle = "Proc. 19th IEEE Int. Conf. on Data Engineering (ICDE '03), Bangalore, India, 2003", url = "http://dx.doi.org/10.1109/ICDM.2006.17") +@Reference(authors = "Y. Pei, O.R. Zaiane, Y. Gao", title = "An Efficient Reference-based Approach to Outlier Detection in Large Datasets", booktitle = "Proc. 6th IEEE Int. Conf. on Data Mining (ICDM '06), Hong Kong, China, 2006", url = "http://dx.doi.org/10.1109/ICDM.2006.17") public class ReferenceBasedOutlierDetection, D extends NumberDistance> extends AbstractAlgorithm implements OutlierAlgorithm { /** * The logger for this class. @@ -164,7 +164,7 @@ public class ReferenceBasedOutlierDetection, D exte for(int l = 0; l < firstReferenceDists.size(); l++) { double density = computeDensity(firstReferenceDists, l); // Initial value - rbod_score.putDouble(firstReferenceDists.get(l).getDBID(), density); + rbod_score.putDouble(firstReferenceDists.get(l), density); } // compute density values for all remaining reference points while(iter.hasNext()) { @@ -174,24 +174,24 @@ public class ReferenceBasedOutlierDetection, D exte for(int l = 0; l < referenceDists.size(); l++) { double density = computeDensity(referenceDists, l); // Update minimum - if(density < rbod_score.doubleValue(referenceDists.get(l).getDBID())) { - rbod_score.putDouble(referenceDists.get(l).getDBID(), density); + if(density < rbod_score.doubleValue(referenceDists.get(l))) { + rbod_score.putDouble(referenceDists.get(l), density); } } } } // compute maximum density double maxDensity = 0.0; - for(DBID id : relation.iterDBIDs()) { - double dens = rbod_score.doubleValue(id); + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + double dens = rbod_score.doubleValue(iditer); if(dens > maxDensity) { maxDensity = dens; } } // compute ROS - for(DBID id : relation.iterDBIDs()) { - double score = 1 - (rbod_score.doubleValue(id) / 
maxDensity); - rbod_score.putDouble(id, score); + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + double score = 1 - (rbod_score.doubleValue(iditer) / maxDensity); + rbod_score.putDouble(iditer, score); } // adds reference points to the result. header information for the @@ -218,9 +218,9 @@ public class ReferenceBasedOutlierDetection, D exte protected List> computeDistanceVector(V refPoint, Relation database, DistanceQuery distFunc) { // TODO: optimize for double distances? List> referenceDists = new ArrayList>(database.size()); - for(DBID id : database.iterDBIDs()) { - final D distance = distFunc.distance(id, refPoint); - referenceDists.add(new GenericDistanceResultPair(distance, id)); + for(DBIDIter iditer = database.iterDBIDs(); iditer.valid(); iditer.advance()) { + final D distance = distFunc.distance(iditer, refPoint); + referenceDists.add(new GenericDistanceResultPair(distance, iditer.getDBID())); } Collections.sort(referenceDists); return referenceDists; diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/SOD.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/SOD.java deleted file mode 100644 index a09bbcfd..00000000 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/SOD.java +++ /dev/null @@ -1,469 +0,0 @@ -package de.lmu.ifi.dbs.elki.algorithm.outlier; - -/* - This file is part of ELKI: - Environment for Developing KDD-Applications Supported by Index-Structures - - Copyright (C) 2012 - Ludwig-Maximilians-Universität München - Lehr- und Forschungseinheit für Datenbanksysteme - ELKI Development Team - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. 
- - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see . - */ - -import java.util.BitSet; -import java.util.Iterator; - -import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm; -import de.lmu.ifi.dbs.elki.data.NumberVector; -import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation; -import de.lmu.ifi.dbs.elki.data.type.TypeInformation; -import de.lmu.ifi.dbs.elki.data.type.TypeUtil; -import de.lmu.ifi.dbs.elki.database.Database; -import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; -import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; -import de.lmu.ifi.dbs.elki.database.datastore.WritableDataStore; -import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs; -import de.lmu.ifi.dbs.elki.database.ids.DBID; -import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; -import de.lmu.ifi.dbs.elki.database.ids.DBIDs; -import de.lmu.ifi.dbs.elki.database.query.similarity.SimilarityQuery; -import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation; -import de.lmu.ifi.dbs.elki.database.relation.Relation; -import de.lmu.ifi.dbs.elki.distance.distancefunction.subspace.SubspaceEuclideanDistanceFunction; -import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance; -import de.lmu.ifi.dbs.elki.distance.similarityfunction.SharedNearestNeighborSimilarityFunction; -import de.lmu.ifi.dbs.elki.distance.similarityfunction.SimilarityFunction; -import de.lmu.ifi.dbs.elki.logging.Logging; -import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress; -import de.lmu.ifi.dbs.elki.math.DoubleMinMax; -import de.lmu.ifi.dbs.elki.result.ResultHierarchy; -import de.lmu.ifi.dbs.elki.result.outlier.BasicOutlierScoreMeta; -import 
de.lmu.ifi.dbs.elki.result.outlier.OutlierResult; -import de.lmu.ifi.dbs.elki.result.outlier.OutlierScoreMeta; -import de.lmu.ifi.dbs.elki.result.textwriter.TextWriteable; -import de.lmu.ifi.dbs.elki.result.textwriter.TextWriterStream; -import de.lmu.ifi.dbs.elki.utilities.DatabaseUtil; -import de.lmu.ifi.dbs.elki.utilities.datastructures.heap.Heap; -import de.lmu.ifi.dbs.elki.utilities.datastructures.heap.TiedTopBoundedHeap; -import de.lmu.ifi.dbs.elki.utilities.documentation.Description; -import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; -import de.lmu.ifi.dbs.elki.utilities.documentation.Title; -import de.lmu.ifi.dbs.elki.utilities.iterator.IterableIterator; -import de.lmu.ifi.dbs.elki.utilities.iterator.IterableUtil; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; -import de.lmu.ifi.dbs.elki.utilities.pairs.DoubleObjPair; - -/** - * @author Arthur Zimek - * - * @apiviz.has SODModel oneway - - computes - * @apiviz.has SharedNearestNeighborSimilarityFunction - * - * @param the type of NumberVector handled by this Algorithm - */ -// todo arthur comment -@Title("SOD: Subspace outlier degree") -@Description("Outlier Detection in Axis-Parallel Subspaces of High Dimensional Data") -@Reference(authors = "H.-P. Kriegel, P. Kröger, E. Schubert, A. 
Zimek", title = "Outlier Detection in Axis-Parallel Subspaces of High Dimensional Data", booktitle = "Proceedings of the 13th Pacific-Asia Conference on Knowledge Discovery and Data Mining (PAKDD), Bangkok, Thailand, 2009", url = "http://dx.doi.org/10.1007/978-3-642-01307-2") -public class SOD, D extends NumberDistance> extends AbstractAlgorithm implements OutlierAlgorithm { - /** - * The logger for this class. - */ - private static final Logging logger = Logging.getLogger(SOD.class); - - /** - * Parameter to specify the number of shared nearest neighbors to be - * considered for learning the subspace properties., must be an integer - * greater than 0. - */ - public static final OptionID KNN_ID = OptionID.getOrCreateOptionID("sod.knn", "The number of most snn-similar objects to use as reference set for learning the subspace properties."); - - /** - * Parameter to indicate the multiplier for the discriminance value for - * discerning small from large variances. - */ - public static final OptionID ALPHA_ID = OptionID.getOrCreateOptionID("sod.alpha", "The multiplier for the discriminance value for discerning small from large variances."); - - /** - * Parameter for the similarity function. - */ - public static final OptionID SIM_ID = OptionID.getOrCreateOptionID("sod.similarity", "The similarity function used for the neighborhood set."); - - /** - * Holds the value of {@link #KNN_ID}. - */ - private int knn; - - /** - * Holds the value of {@link #ALPHA_ID}. - */ - private double alpha; - - /** - * The similarity function {@link #SIM_ID}. - */ - private SimilarityFunction similarityFunction; - - /** - * Constructor with parameters. 
- * - * @param knn knn value - * @param alpha Alpha parameter - * @param similarityFunction Shared nearest neighbor similarity function - */ - public SOD(int knn, double alpha, SimilarityFunction similarityFunction) { - super(); - this.knn = knn; - this.alpha = alpha; - this.similarityFunction = similarityFunction; - } - - /** - * Performs the SOD algorithm on the given database. - * - * @param relation Data relation to process - */ - public OutlierResult run(Relation relation) throws IllegalStateException { - SimilarityQuery snnInstance = similarityFunction.instantiate(relation); - FiniteProgress progress = logger.isVerbose() ? new FiniteProgress("Assigning Subspace Outlier Degree", relation.size(), logger) : null; - WritableDataStore> sod_models = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC, SODModel.class); - DoubleMinMax minmax = new DoubleMinMax(); - for(Iterator iter = relation.iterDBIDs(); iter.hasNext();) { - DBID queryObject = iter.next(); - if(progress != null) { - progress.incrementProcessed(logger); - } - DBIDs knnList = getNearestNeighbors(relation, snnInstance, queryObject); - SODModel model = new SODModel(relation, knnList, alpha, relation.get(queryObject)); - sod_models.put(queryObject, model); - minmax.put(model.getSod()); - } - if(progress != null) { - progress.ensureCompleted(logger); - } - // combine results. - Relation> models = new MaterializedRelation>("Subspace Outlier Model", "sod-outlier", new SimpleTypeInformation>(SODModel.class), sod_models, relation.getDBIDs()); - OutlierScoreMeta meta = new BasicOutlierScoreMeta(minmax.getMin(), minmax.getMax()); - OutlierResult sodResult = new OutlierResult(meta, new SODProxyScoreResult(models, relation.getDBIDs())); - // also add the models. - sodResult.addChildResult(models); - return sodResult; - } - - /** - * Provides the k nearest neighbors in terms of the shared nearest neighbor - * distance. - *

- * The query object is excluded from the knn list. - * - * @param relation the database holding the objects - * @param simQ similarity function - * @param queryObject the query object for which the kNNs should be determined - * @return the k nearest neighbors in terms of the shared nearest neighbor - * distance without the query object - */ - private DBIDs getNearestNeighbors(Relation relation, SimilarityQuery simQ, DBID queryObject) { - // similarityFunction.getPreprocessor().getParameters(); - Heap> nearestNeighbors = new TiedTopBoundedHeap>(knn); - for(DBID id : relation.iterDBIDs()) { - if(!id.equals(queryObject)) { - double sim = simQ.similarity(queryObject, id).doubleValue(); - if(sim > 0) { - nearestNeighbors.add(new DoubleObjPair(sim, id)); - } - } - } - // Collect DBIDs - ArrayModifiableDBIDs dbids = DBIDUtil.newArray(nearestNeighbors.size()); - while(nearestNeighbors.size() > 0) { - final DoubleObjPair next = nearestNeighbors.poll(); - dbids.add(next.second); - } - return dbids; - } - - @Override - public TypeInformation[] getInputTypeRestriction() { - return TypeUtil.array(TypeUtil.NUMBER_VECTOR_FIELD); - } - - @Override - protected Logging getLogger() { - return logger; - } - - /** - * - * - * @author Arthur Zimek - * @param the type of DatabaseObjects handled by this Result - */ - // TODO: arthur comment - public static class SODModel> implements TextWriteable, Comparable> { - private double[] centerValues; - - private V center; - - private double[] variances; - - private double expectationOfVariance; - - private BitSet weightVector; - - private double sod; - - /** - * Initialize SOD Model - * - * @param relation Database - * @param neighborhood Neighborhood - * @param alpha Alpha value - * @param queryObject Query object - */ - public SODModel(Relation relation, DBIDs neighborhood, double alpha, V queryObject) { - if(neighborhood.size() > 0) { - // TODO: store database link? 
- centerValues = new double[DatabaseUtil.dimensionality(relation)]; - variances = new double[centerValues.length]; - for(DBID id : neighborhood) { - V databaseObject = relation.get(id); - for(int d = 0; d < centerValues.length; d++) { - centerValues[d] += databaseObject.doubleValue(d + 1); - } - } - for(int d = 0; d < centerValues.length; d++) { - centerValues[d] /= neighborhood.size(); - } - for(DBID id : neighborhood) { - V databaseObject = relation.get(id); - for(int d = 0; d < centerValues.length; d++) { - // distance - double distance = centerValues[d] - databaseObject.doubleValue(d + 1); - // variance - variances[d] += distance * distance; - } - } - expectationOfVariance = 0; - for(int d = 0; d < variances.length; d++) { - variances[d] /= neighborhood.size(); - expectationOfVariance += variances[d]; - } - expectationOfVariance /= variances.length; - weightVector = new BitSet(variances.length); - for(int d = 0; d < variances.length; d++) { - if(variances[d] < alpha * expectationOfVariance) { - weightVector.set(d, true); - } - } - center = DatabaseUtil.assumeVectorField(relation).getFactory().newNumberVector(centerValues); - sod = subspaceOutlierDegree(queryObject, center, weightVector); - } - else { - center = queryObject; - sod = 0.0; - } - } - - /** - * Compute SOD score - * - * @param queryObject - * @param center - * @param weightVector - * @return sod value - */ - private double subspaceOutlierDegree(V queryObject, V center, BitSet weightVector) { - final SubspaceEuclideanDistanceFunction df = new SubspaceEuclideanDistanceFunction(weightVector); - final int card = weightVector.cardinality(); - if(card == 0) { - return 0; - } - double distance = df.distance(queryObject, center).doubleValue(); - distance /= card; - return distance; - } - - /** - * Return the SOD of the point. 
- * - * @return sod value - */ - public double getSod() { - return this.sod; - } - - @Override - public void writeToText(TextWriterStream out, String label) { - out.inlinePrint(label + "=" + this.sod); - out.commentPrintLn(this.getClass().getSimpleName() + ":"); - out.commentPrintLn("relevant attributes (counting starts with 0): " + this.weightVector.toString()); - out.commentPrintLn("center of neighborhood: " + out.normalizationRestore(center).toString()); - out.commentPrintLn("subspace outlier degree: " + this.sod); - out.commentPrintSeparator(); - } - - @Override - public int compareTo(SODModel o) { - return Double.compare(this.getSod(), o.getSod()); - } - - } - - /** - * Proxy class that converts a model result to an actual SOD score result. - * - * @author Erich Schubert - * - * @apiviz.exclude - */ - protected static class SODProxyScoreResult implements Relation { - /** - * Model result this is a proxy for. - */ - Relation> models; - - /** - * The IDs we are defined for - */ - DBIDs dbids; - - /** - * Constructor. 
- * - * @param models Models result - * @param dbids IDs we are defined for - */ - public SODProxyScoreResult(Relation> models, DBIDs dbids) { - super(); - this.models = models; - this.dbids = dbids; - } - - @Override - public Double get(DBID objID) { - return models.get(objID).getSod(); - } - - @Override - public String getLongName() { - return "Subspace Outlier Degree"; - } - - @Override - public String getShortName() { - return "sod-outlier"; - } - - @Override - public DBIDs getDBIDs() { - return dbids; - } - - @Override - public IterableIterator iterDBIDs() { - return IterableUtil.fromIterator(dbids.iterator()); - } - - @Override - public Database getDatabase() { - return null; // FIXME - } - - @Override - public void set(DBID id, Double val) { - throw new UnsupportedOperationException(); - } - - @Override - public void delete(DBID id) { - throw new UnsupportedOperationException(); - } - - @Override - public SimpleTypeInformation getDataTypeInformation() { - return TypeUtil.DOUBLE; - } - - @Override - public int size() { - return dbids.size(); - } - - @Override - public ResultHierarchy getHierarchy() { - return models.getHierarchy(); - } - - @Override - public void setHierarchy(ResultHierarchy hierarchy) { - models.setHierarchy(hierarchy); - } - } - - /** - * Parameterization class. - * - * @author Erich Schubert - * - * @apiviz.exclude - */ - public static class Parameterizer, D extends NumberDistance> extends AbstractParameterizer { - /** - * Holds the value of {@link #KNN_ID}. - */ - private int knn = 1; - - /** - * Holds the value of {@link #ALPHA_ID}. - */ - private double alpha = 1.1; - - /** - * The similarity function - {@link #SIM_ID}. 
- */ - private SimilarityFunction similarityFunction; - - @Override - protected void makeOptions(Parameterization config) { - super.makeOptions(config); - final ObjectParameter> simP = new ObjectParameter>(SIM_ID, SimilarityFunction.class, SharedNearestNeighborSimilarityFunction.class); - if(config.grab(simP)) { - similarityFunction = simP.instantiateClass(config); - } - - final IntParameter knnP = new IntParameter(KNN_ID, new GreaterConstraint(0)); - if(config.grab(knnP)) { - knn = knnP.getValue(); - } - - final DoubleParameter alphaP = new DoubleParameter(ALPHA_ID, new GreaterConstraint(0), 1.1); - if(config.grab(alphaP)) { - alpha = alphaP.getValue(); - } - } - - @Override - protected SOD makeInstance() { - return new SOD(knn, alpha, similarityFunction); - } - } -} \ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/meta/ExternalDoubleOutlierScore.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/meta/ExternalDoubleOutlierScore.java index 22447454..1542b8e3 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/meta/ExternalDoubleOutlierScore.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/meta/ExternalDoubleOutlierScore.java @@ -40,7 +40,7 @@ import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore; -import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation; import de.lmu.ifi.dbs.elki.database.relation.Relation; @@ -142,7 +142,7 @@ public class ExternalDoubleOutlierScore extends AbstractAlgorithm public OutlierResult run(Database database, Relation relation) { WritableDoubleDataStore scores = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC); - Pattern colSep = 
Pattern.compile(AbstractParser.WHITESPACE_PATTERN); + Pattern colSep = Pattern.compile(AbstractParser.DEFAULT_SEPARATOR); DoubleMinMax minmax = new DoubleMinMax(); InputStream in; try { @@ -210,10 +210,10 @@ public class ExternalDoubleOutlierScore extends AbstractAlgorithm ((OutlierScalingFunction) scaling).prepare(or); } DoubleMinMax mm = new DoubleMinMax(); - for(DBID id : relation.iterDBIDs()) { - double val = scoresult.get(id); // scores.get(id); + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + double val = scoresult.get(iditer); val = scaling.getScaled(val); - scores.putDouble(id, val); + scores.putDouble(iditer, val); mm.put(val); } meta = new BasicOutlierScoreMeta(mm.getMin(), mm.getMax()); diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/meta/FeatureBagging.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/meta/FeatureBagging.java index c8da9501..407b7400 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/meta/FeatureBagging.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/meta/FeatureBagging.java @@ -36,7 +36,7 @@ import de.lmu.ifi.dbs.elki.data.type.TypeUtil; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore; -import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.distance.distancefunction.subspace.SubspaceEuclideanDistanceFunction; @@ -50,7 +50,6 @@ import de.lmu.ifi.dbs.elki.result.outlier.OutlierScoreMeta; import de.lmu.ifi.dbs.elki.utilities.DatabaseUtil; import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import de.lmu.ifi.dbs.elki.utilities.documentation.Title; -import de.lmu.ifi.dbs.elki.utilities.iterator.IterableIterator; import 
de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint; @@ -163,14 +162,14 @@ public class FeatureBagging extends AbstractAlgorithm implements DoubleMinMax minmax = new DoubleMinMax(); if(breadth) { FiniteProgress cprog = logger.isVerbose() ? new FiniteProgress("Combining results", relation.size(), logger) : null; - Pair, Relation>[] IDVectorOntoScoreVector = Pair.newPairArray(results.size()); + Pair>[] IDVectorOntoScoreVector = Pair.newPairArray(results.size()); // Mapping score-sorted DBID-Iterators onto their corresponding scores. // We need to initialize them now be able to iterate them "in parallel". { int i = 0; for(OutlierResult r : results) { - IDVectorOntoScoreVector[i] = new Pair, Relation>(r.getOrdering().iter(relation.getDBIDs()), r.getScores()); + IDVectorOntoScoreVector[i] = new Pair>(r.getOrdering().iter(relation.getDBIDs()).iter(), r.getScores()); i++; } } @@ -178,17 +177,17 @@ public class FeatureBagging extends AbstractAlgorithm implements // Iterating over the *lines* of the AS_t(i)-matrix. for(int i = 0; i < relation.size(); i++) { // Iterating over the elements of a line (breadth-first). - for(Pair, Relation> pair : IDVectorOntoScoreVector) { - IterableIterator iter = pair.first; + for(Pair> pair : IDVectorOntoScoreVector) { + DBIDIter iter = pair.first; // Always true if every algorithm returns a complete result (one score // for every DBID). 
- if(iter.hasNext()) { - DBID tmpID = iter.next(); - double score = pair.second.get(tmpID); - if(Double.isNaN(scores.doubleValue(tmpID))) { - scores.putDouble(tmpID, score); + if(iter.valid()) { + double score = pair.second.get(iter); + if(Double.isNaN(scores.doubleValue(iter))) { + scores.putDouble(iter, score); minmax.put(score); } + iter.advance(); } else { logger.warning("Incomplete result: Iterator does not contain |DB| DBIDs"); @@ -205,15 +204,15 @@ public class FeatureBagging extends AbstractAlgorithm implements } else { FiniteProgress cprog = logger.isVerbose() ? new FiniteProgress("Combining results", relation.size(), logger) : null; - for(DBID id : relation.iterDBIDs()) { + for(DBIDIter iter = relation.iterDBIDs(); iter.valid(); iter.advance()) { double sum = 0.0; for(OutlierResult r : results) { - final Double s = r.getScores().get(id); + final Double s = r.getScores().get(iter); if (s != null && !Double.isNaN(s)) { sum += s; } } - scores.putDouble(id, sum); + scores.putDouble(iter, sum); minmax.put(sum); if(cprog != null) { cprog.incrementProcessed(logger); diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/meta/HiCS.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/meta/HiCS.java new file mode 100644 index 00000000..73d4156a --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/meta/HiCS.java @@ -0,0 +1,633 @@ +package de.lmu.ifi.dbs.elki.algorithm.outlier.meta; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2012 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see . + */ + +import java.util.ArrayList; +import java.util.BitSet; +import java.util.Collections; +import java.util.Comparator; +import java.util.List; +import java.util.Random; +import java.util.Set; +import java.util.TreeSet; + +import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm; +import de.lmu.ifi.dbs.elki.algorithm.outlier.LOF; +import de.lmu.ifi.dbs.elki.algorithm.outlier.OutlierAlgorithm; +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.data.VectorUtil; +import de.lmu.ifi.dbs.elki.data.VectorUtil.SortDBIDsBySingleDimension; +import de.lmu.ifi.dbs.elki.data.projection.NumericalFeatureSelection; +import de.lmu.ifi.dbs.elki.data.projection.Projection; +import de.lmu.ifi.dbs.elki.data.type.TypeInformation; +import de.lmu.ifi.dbs.elki.data.type.TypeUtil; +import de.lmu.ifi.dbs.elki.database.ProxyDatabase; +import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; +import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; +import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore; +import de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs; +import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; +import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.ids.DBIDs; +import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation; +import de.lmu.ifi.dbs.elki.database.relation.ProjectedView; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress; +import 
de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress; +import de.lmu.ifi.dbs.elki.math.DoubleMinMax; +import de.lmu.ifi.dbs.elki.math.statistics.tests.GoodnessOfFitTest; +import de.lmu.ifi.dbs.elki.math.statistics.tests.KolmogorovSmirnovTest; +import de.lmu.ifi.dbs.elki.result.outlier.BasicOutlierScoreMeta; +import de.lmu.ifi.dbs.elki.result.outlier.OutlierResult; +import de.lmu.ifi.dbs.elki.result.outlier.OutlierScoreMeta; +import de.lmu.ifi.dbs.elki.utilities.DatabaseUtil; +import de.lmu.ifi.dbs.elki.utilities.datastructures.heap.TopBoundedHeap; +import de.lmu.ifi.dbs.elki.utilities.documentation.Description; +import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; +import de.lmu.ifi.dbs.elki.utilities.documentation.Title; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.LongParameter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; + +/** + * Algorithm to compute High Contrast Subspaces for Density-Based Outlier + * Ranking. + * + * Reference: + *

+ * Fabian Keller, Emmanuel Müller, Klemens Böhm:
+ * HiCS: High Contrast Subspaces for Density-Based Outlier Ranking
+ * in: Proc. IEEE 28th Int. Conf. on Data Engineering (ICDE 2012), Washington, + * DC, USA + *

+ * + * @author Jan Brusis + * @author Erich Schubert + * + * @apiviz.composedOf GoodnessOfFitTest + * @apiviz.composedOf OutlierAlgorithm + * + * @param vector type + */ +@Title("HiCS: High Contrast Subspaces for Density-Based Outlier Ranking") +@Description("Algorithm to compute High Contrast Subspaces in a database as a pre-processing step for for density-based outlier ranking methods.") +@Reference(authors = "Fabian Keller, Emmanuel Müller, Klemens Böhm", title = "HiCS: High Contrast Subspaces for Density-Based Outlier Ranking", booktitle = "Proc. IEEE 28th International Conference on Data Engineering (ICDE 2012)") +public class HiCS> extends AbstractAlgorithm implements OutlierAlgorithm { + /** + * The Logger for this class + */ + private static final Logging logger = Logging.getLogger(HiCS.class); + + /** + * Maximum number of retries. + */ + private static final int MAX_RETRIES = 100; + + /** + * Monte-Carlo iterations + */ + private int m; + + /** + * Alpha threshold + */ + private double alpha; + + /** + * Outlier detection algorithm + */ + private OutlierAlgorithm outlierAlgorithm; + + /** + * Statistical test to use + */ + private GoodnessOfFitTest statTest; + + /** + * Candidates limit + */ + private int cutoff; + + /** + * Random generator + */ + private Random random; + + /** + * Constructor + * + * @param m value of m + * @param alpha value of alpha + * @param outlierAlgorithm Inner outlier detection algorithm + * @param statTest Test to use + * @param cutoff Candidate limit + * @param seed Random seed + */ + public HiCS(int m, double alpha, OutlierAlgorithm outlierAlgorithm, GoodnessOfFitTest statTest, int cutoff, Long seed) { + super(); + this.m = m; + this.alpha = alpha; + this.outlierAlgorithm = outlierAlgorithm; + this.statTest = statTest; + this.cutoff = cutoff; + this.random = (seed != null) ? 
new Random(seed) : new Random(); + } + + /** + * Perform HiCS on a given database + * + * @param relation the database + * @return The aggregated resulting scores that were assigned by the given + * outlier detection algorithm + */ + public OutlierResult run(Relation relation) { + final DBIDs ids = relation.getDBIDs(); + final V factory = DatabaseUtil.assumeVectorField(relation).getFactory(); + + ArrayList subspaceIndex = buildOneDimIndexes(relation); + Set subspaces = calculateSubspaces(relation, subspaceIndex); + + if(logger.isVerbose()) { + logger.verbose("Number of high-contrast subspaces: " + subspaces.size()); + } + List> results = new ArrayList>(); + FiniteProgress prog = logger.isVerbose() ? new FiniteProgress("Calculating Outlier scores for high Contrast subspaces", subspaces.size(), logger) : null; + + // run outlier detection and collect the result + // TODO extend so that any outlierAlgorithm can be used (use materialized + // relation instead of SubspaceEuclideanDistanceFunction?) 
+ for(HiCSSubspace dimset : subspaces) { + if(logger.isVerbose()) { + logger.verbose("Performing outlier detection in subspace " + dimset); + } + + ProxyDatabase pdb = new ProxyDatabase(ids); + Projection proj = new NumericalFeatureSelection(dimset, factory); + pdb.addRelation(new ProjectedView(relation, proj)); + + // run LOF and collect the result + OutlierResult result = outlierAlgorithm.run(pdb); + results.add(result.getScores()); + if(prog != null) { + prog.incrementProcessed(logger); + } + } + if(prog != null) { + prog.ensureCompleted(logger); + } + + WritableDoubleDataStore scores = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC); + DoubleMinMax minmax = new DoubleMinMax(); + + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + double sum = 0.0; + for(Relation r : results) { + final Double s = r.get(iditer); + if(s != null && !Double.isNaN(s)) { + sum += s; + } + } + scores.putDouble(iditer, sum); + minmax.put(sum); + } + OutlierScoreMeta meta = new BasicOutlierScoreMeta(minmax.getMin(), minmax.getMax()); + Relation scoreres = new MaterializedRelation("HiCS", "HiCS-outlier", TypeUtil.DOUBLE, scores, relation.getDBIDs()); + + return new OutlierResult(meta, scoreres); + } + + /** + * Calculates "index structures" for every attribute, i.e. 
sorts a + * ModifiableArray of every DBID in the database for every dimension and + * stores them in a list + * + * @param relation Relation to index + * @return List of sorted objects + */ + private ArrayList buildOneDimIndexes(Relation> relation) { + final int dim = DatabaseUtil.dimensionality(relation); + ArrayList subspaceIndex = new ArrayList(dim + 1); + + SortDBIDsBySingleDimension comp = new VectorUtil.SortDBIDsBySingleDimension(relation); + for(int i = 1; i <= dim; i++) { + ArrayModifiableDBIDs amDBIDs = DBIDUtil.newArray(relation.getDBIDs()); + comp.setDimension(i); + amDBIDs.sort(comp); + subspaceIndex.add(amDBIDs); + } + + return subspaceIndex; + } + + /** + * Identifies high contrast subspaces in a given full-dimensional database + * + * @param relation the relation the HiCS should be evaluated for + * @param subspaceIndex Subspace indexes + * @return a set of high contrast subspaces + */ + private Set calculateSubspaces(Relation> relation, ArrayList subspaceIndex) { + final int dbdim = DatabaseUtil.dimensionality(relation); + + FiniteProgress dprog = logger.isVerbose() ? new FiniteProgress("Subspace dimensionality", dbdim, logger) : null; + if(dprog != null) { + dprog.setProcessed(2, logger); + } + + TreeSet subspaceList = new TreeSet(HiCSSubspace.SORT_BY_SUBSPACE); + TopBoundedHeap dDimensionalList = new TopBoundedHeap(cutoff, HiCSSubspace.SORT_BY_CONTRAST_ASC); + FiniteProgress prog = logger.isVerbose() ? new FiniteProgress("Generating two-element subsets", dbdim * (dbdim - 1) / 2, logger) : null; + // compute two-element sets of subspaces + for(int i = 0; i < dbdim; i++) { + for(int j = i + 1; j < dbdim; j++) { + HiCSSubspace ts = new HiCSSubspace(); + ts.set(i); + ts.set(j); + calculateContrast(relation, ts, subspaceIndex); + dDimensionalList.add(ts); + if(prog != null) { + prog.incrementProcessed(logger); + } + } + } + if(prog != null) { + prog.ensureCompleted(logger); + } + + IndefiniteProgress qprog = logger.isVerbose() ? 
new IndefiniteProgress("Testing subspace candidates", logger) : null; + for(int d = 3; !dDimensionalList.isEmpty(); d++) { + if(dprog != null) { + dprog.setProcessed(d, logger); + } + subspaceList.addAll(dDimensionalList); + // result now contains all d-dimensional sets of subspaces + + ArrayList candidateList = new ArrayList(dDimensionalList); + dDimensionalList.clear(); + // candidateList now contains the *m* best d-dimensional sets + Collections.sort(candidateList, HiCSSubspace.SORT_BY_SUBSPACE); + + // TODO: optimize APRIORI style, by not even computing the bit set or? + for(int i = 0; i < candidateList.size() - 1; i++) { + for(int j = i + 1; j < candidateList.size(); j++) { + HiCSSubspace set1 = candidateList.get(i); + HiCSSubspace set2 = candidateList.get(j); + + HiCSSubspace joinedSet = new HiCSSubspace(); + joinedSet.or(set1); + joinedSet.or(set2); + if(joinedSet.cardinality() != d) { + continue; + } + + calculateContrast(relation, joinedSet, subspaceIndex); + dDimensionalList.add(joinedSet); + if(qprog != null) { + qprog.incrementProcessed(logger); + } + } + } + // Prune + for(HiCSSubspace cand : candidateList) { + for(HiCSSubspace nextSet : dDimensionalList) { + if(nextSet.contrast > cand.contrast) { + subspaceList.remove(cand); + break; + } + } + } + } + if(qprog != null) { + qprog.setCompleted(logger); + } + if(dprog != null) { + dprog.setProcessed(dbdim, logger); + dprog.ensureCompleted(logger); + } + return subspaceList; + } + + /** + * Calculates the actual contrast of a given subspace + * + * @param relation + * @param subspace + * @param subspaceIndex Subspace indexes + */ + private void calculateContrast(Relation> relation, HiCSSubspace subspace, ArrayList subspaceIndex) { + final int card = subspace.cardinality(); + final double alpha1 = Math.pow(alpha, (1.0 / card)); + final int windowsize = (int) (relation.size() * alpha1); + final FiniteProgress prog = logger.isDebugging() ? 
new FiniteProgress("Monte-Carlo iterations", m, logger) : null; + + int retries = 0; + double deviationSum = 0.0; + for(int i = 0; i < m; i++) { + // Choose a random set bit. + int chosen = -1; + for(int tmp = random.nextInt(card); tmp >= 0; tmp--) { + chosen = subspace.nextSetBit(chosen + 1); + } + // initialize sample + DBIDs conditionalSample = relation.getDBIDs(); + + for(int j = subspace.nextSetBit(0); j >= 0; j = subspace.nextSetBit(j + 1)) { + if(j == chosen) { + continue; + } + ArrayDBIDs sortedIndices = subspaceIndex.get(j); + ArrayModifiableDBIDs indexBlock = DBIDUtil.newArray(); + // initialize index block + int start = random.nextInt(relation.size() - windowsize); + for(int k = start; k < start + windowsize; k++) { + indexBlock.add(sortedIndices.get(k)); // select index block + } + + conditionalSample = DBIDUtil.intersection(conditionalSample, indexBlock); + } + if(conditionalSample.size() < 10) { + retries++; + if(logger.isDebugging()) { + logger.debug("Sample size very small. Retry no. 
" + retries); + } + if(retries >= MAX_RETRIES) { + logger.warning("Too many retries, for small samples: " + retries); + } + else { + i--; + continue; + } + } + // Project conditional set + double[] sampleValues = new double[conditionalSample.size()]; + { + int l = 0; + for (DBIDIter iter = conditionalSample.iter(); iter.valid(); iter.advance()) { + sampleValues[l] = relation.get(iter).doubleValue(chosen + 1); + l++; + } + } + // Project full set + double[] fullValues = new double[relation.size()]; + { + int l = 0; + for (DBIDIter iter = subspaceIndex.get(chosen).iter(); iter.valid(); iter.advance()) { + fullValues[l] = relation.get(iter).doubleValue(chosen + 1); + l++; + } + } + double contrast = statTest.deviation(fullValues, sampleValues); + if(Double.isNaN(contrast)) { + i--; + logger.warning("Contrast was NaN"); + continue; + } + deviationSum += contrast; + if(prog != null) { + prog.incrementProcessed(logger); + } + } + if(prog != null) { + prog.ensureCompleted(logger); + } + subspace.contrast = deviationSum / m; + } + + @Override + public TypeInformation[] getInputTypeRestriction() { + return TypeUtil.array(TypeUtil.NUMBER_VECTOR_FIELD); + } + + @Override + protected Logging getLogger() { + return logger; + } + + /** + * BitSet that holds a contrast value as field. Used for the representation of + * a subspace in HiCS + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class HiCSSubspace extends BitSet { + /** + * Serial version + */ + private static final long serialVersionUID = 1L; + + /** + * The HiCS contrast value + */ + protected double contrast; + + /** + * Constructor. 
+ */ + public HiCSSubspace() { + super(); + } + + @Override + public String toString() { + StringBuffer buf = new StringBuffer(); + buf.append("[contrast=").append(contrast); + for(int i = nextSetBit(0); i >= 0; i = nextSetBit(i + 1)) { + buf.append(" ").append(i + 1); + } + buf.append("]"); + return buf.toString(); + } + + /** + * Sort subspaces by their contrast value, in ascending order. + */ + public static Comparator SORT_BY_CONTRAST_ASC = new Comparator() { + @Override + public int compare(HiCSSubspace o1, HiCSSubspace o2) { + if(o1.contrast == o2.contrast) { + return 0; + } + return o1.contrast > o2.contrast ? 1 : -1; + } + }; + + /** + * Sort subspaces by their contrast value, in descending order. + */ + public static Comparator SORT_BY_CONTRAST_DESC = new Comparator() { + @Override + public int compare(HiCSSubspace o1, HiCSSubspace o2) { + if(o1.contrast == o2.contrast) { + return 0; + } + return o1.contrast < o2.contrast ? 1 : -1; + } + }; + + /** + * Sort subspaces lexicographically by their set dimensions; a proper prefix sorts before its extensions. + */ + public static Comparator SORT_BY_SUBSPACE = new Comparator() { + @Override + public int compare(HiCSSubspace o1, HiCSSubspace o2) { + int dim1 = o1.nextSetBit(0); + int dim2 = o2.nextSetBit(0); + while(dim1 >= 0 && dim2 >= 0) { + if(dim1 < dim2) { + return -1; + } + else if(dim1 > dim2) { + return 1; + } + dim1 = o1.nextSetBit(dim1 + 1); + dim2 = o2.nextSetBit(dim2 + 1); + } + return (dim1 >= 0) ? 1 : ((dim2 >= 0) ? -1 : 0); // one set is a prefix of the other: the longer one sorts last + } + }; + } + + /** + * Parameterization class + * + * @author Jan Brusis + * + * @apiviz.exclude + * + * @param vector type + */ + public static class Parameterizer> extends AbstractParameterizer { + /** + * Parameter that specifies the number of iterations in the Monte-Carlo + * process of identifying high contrast subspaces + */ + public static final OptionID M_ID = OptionID.getOrCreateOptionID("hics.m", "The number of iterations in the Monte-Carlo processing."); + + /** + * Parameter that determines the size of the test statistic during the + * Monte-Carlo iteration + */ + public static final OptionID ALPHA_ID =
OptionID.getOrCreateOptionID("hics.alpha", "The discriminance value that determines the size of the test statistic ."); + + /** + * Parameter that specifies which outlier detection algorithm to use on the + * resulting set of high contrast subspaces + */ + public static final OptionID ALGO_ID = OptionID.getOrCreateOptionID("hics.algo", "The Algorithm that performs the actual outlier detection on the resulting set of subspace"); + + /** + * Parameter that specifies which statistical test to use in order to + * calculate the deviation of two given data samples + */ + public static final OptionID TEST_ID = OptionID.getOrCreateOptionID("hics.test", "The statistical test that is used to calculate the deviation of two data samples"); + + /** + * Parameter that specifies the candidate_cutoff + */ + public static final OptionID LIMIT_ID = OptionID.getOrCreateOptionID("hics.limit", "The threshold that determines how many d-dimensional subspace candidates to retain in each step of the generation"); + + /** + * Parameter that specifies the random seed + */ + public static final OptionID SEED_ID = OptionID.getOrCreateOptionID("hics.seed", "The random seed."); + + /** + * Holds the value of {@link #M_ID}. + */ + private int m = 50; + + /** + * Holds the value of {@link #ALPHA_ID}. + */ + private double alpha = 0.1; + + /** + * Holds the value of {@link #ALGO_ID}. + */ + private OutlierAlgorithm outlierAlgorithm; + + /** + * Holds the value of {@link #TEST_ID}. 
+ */ + private GoodnessOfFitTest statTest; + + /** + * Holds the value of {@link #LIMIT_ID} + */ + private int cutoff = 400; + + /** + * Random seed (optional) + */ + private Long seed = null; + + @Override + protected void makeOptions(Parameterization config) { + super.makeOptions(config); + final IntParameter mP = new IntParameter(M_ID, new GreaterConstraint(1), 50); + if(config.grab(mP)) { + m = mP.getValue(); + } + + final DoubleParameter alphaP = new DoubleParameter(ALPHA_ID, new GreaterConstraint(0), 0.1); + if(config.grab(alphaP)) { + alpha = alphaP.getValue(); + } + + final ObjectParameter algoP = new ObjectParameter(ALGO_ID, OutlierAlgorithm.class, LOF.class); + if(config.grab(algoP)) { + outlierAlgorithm = algoP.instantiateClass(config); + } + + final ObjectParameter testP = new ObjectParameter(TEST_ID, GoodnessOfFitTest.class, KolmogorovSmirnovTest.class); + if(config.grab(testP)) { + statTest = testP.instantiateClass(config); + } + + final IntParameter cutoffP = new IntParameter(LIMIT_ID, new GreaterConstraint(1), 100); + if(config.grab(cutoffP)) { + cutoff = cutoffP.getValue(); + } + + final LongParameter seedP = new LongParameter(SEED_ID, true); + if(config.grab(seedP)) { + seed = seedP.getValue(); + } +} + + @Override + protected HiCS makeInstance() { + return new HiCS(m, alpha, outlierAlgorithm, statTest, cutoff, seed); + } + } +} \ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/meta/RescaleMetaOutlierAlgorithm.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/meta/RescaleMetaOutlierAlgorithm.java index 9634cd59..a4db7e3d 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/meta/RescaleMetaOutlierAlgorithm.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/meta/RescaleMetaOutlierAlgorithm.java @@ -34,7 +34,7 @@ import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; import 
de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore; -import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.logging.Logging; @@ -55,6 +55,8 @@ import de.lmu.ifi.dbs.elki.utilities.scaling.outlier.OutlierScalingFunction; * Scale another outlier score using the given scaling function. * * @author Erich Schubert + * + * @apiviz.composedOf OutlierAlgorithm */ public class RescaleMetaOutlierAlgorithm extends AbstractAlgorithm implements OutlierAlgorithm { /** @@ -93,7 +95,7 @@ public class RescaleMetaOutlierAlgorithm extends AbstractAlgorithm, D exte ModifiableDBIDs idview = DBIDUtil.newHashSet(relationx.getDBIDs()); ProxyView proxy = new ProxyView(relationx.getDatabase(), idview, relationx); - double phialpha = NormalDistribution.standardNormalProbit(1.0 - alpha / 2); + double phialpha = NormalDistribution.standardNormalQuantile(1.0 - alpha / 2); // Detect outliers while significant. 
while(true) { Pair candidate = singleIteration(proxy, relationy); @@ -144,8 +145,8 @@ public class CTLuGLSBackwardSearchAlgorithm, D exte } // Remaining objects are inliers - for(DBID id : idview) { - scores.putDouble(id, 0.0); + for (DBIDIter iter = idview.iter(); iter.valid(); iter.advance()) { + scores.putDouble(iter.getDBID(), 0.0); } } @@ -204,7 +205,7 @@ public class CTLuGLSBackwardSearchAlgorithm, D exte KNNResult neighbors = knnQuery.getKNNForDBID(id, k + 1); ModifiableDBIDs neighborhood = DBIDUtil.newArray(neighbors.size()); for(DistanceResultPair dpair : neighbors) { - if(id.equals(dpair.getDBID())) { + if(id.sameDBID(dpair.getDBID())) { continue; } neighborhood.add(dpair.getDBID()); @@ -213,8 +214,8 @@ public class CTLuGLSBackwardSearchAlgorithm, D exte F.set(i, i, 1.0); final int nweight = -1 / neighborhood.size(); // We need to find the index positions of the neighbors, unfortunately. - for(DBID nid : neighborhood) { - int pos = ids.binarySearch(nid); + for (DBIDIter iter = neighborhood.iter(); iter.valid(); iter.advance()) { + int pos = ids.binarySearch(iter.getDBID()); assert (pos >= 0); F.set(pos, i, nweight); } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/CTLuMeanMultipleAttributes.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/CTLuMeanMultipleAttributes.java index 68e58ffa..a0c09057 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/CTLuMeanMultipleAttributes.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/CTLuMeanMultipleAttributes.java @@ -32,6 +32,7 @@ import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; import de.lmu.ifi.dbs.elki.database.datastore.WritableDataStore; import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore; import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation; import 
de.lmu.ifi.dbs.elki.database.relation.Relation; @@ -99,7 +100,8 @@ public class CTLuMeanMultipleAttributes> extends CovarianceMatrix covmaker = new CovarianceMatrix(DatabaseUtil.dimensionality(attributes)); WritableDataStore deltas = DataStoreUtil.makeStorage(attributes.getDBIDs(), DataStoreFactory.HINT_TEMP, Vector.class); - for(DBID id : attributes.iterDBIDs()) { + for(DBIDIter iditer = attributes.iterDBIDs(); iditer.valid(); iditer.advance()) { + DBID id = iditer.getDBID(); final O obj = attributes.get(id); final DBIDs neighbors = npred.getNeighborDBIDs(id); // TODO: remove object itself from neighbors? @@ -117,7 +119,8 @@ public class CTLuMeanMultipleAttributes> extends DoubleMinMax minmax = new DoubleMinMax(); WritableDoubleDataStore scores = DataStoreUtil.makeDoubleStorage(attributes.getDBIDs(), DataStoreFactory.HINT_STATIC); - for(DBID id : attributes.iterDBIDs()) { + for(DBIDIter iditer = attributes.iterDBIDs(); iditer.valid(); iditer.advance()) { + DBID id = iditer.getDBID(); Vector temp = deltas.get(id).minus(mean); final double score = temp.transposeTimesTimes(cmati, temp); minmax.put(score); diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/CTLuMedianAlgorithm.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/CTLuMedianAlgorithm.java index 9b4534fe..20ab9a00 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/CTLuMedianAlgorithm.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/CTLuMedianAlgorithm.java @@ -31,6 +31,7 @@ import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore; import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation; import de.lmu.ifi.dbs.elki.database.relation.Relation; @@ -94,18 +95,19 @@ public class 
CTLuMedianAlgorithm extends AbstractNeighborhoodOutlier { WritableDoubleDataStore scores = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC); MeanVariance mv = new MeanVariance(); - for(DBID id : relation.iterDBIDs()) { + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + DBID id = iditer.getDBID(); DBIDs neighbors = npred.getNeighborDBIDs(id); final double median; { double[] fi = new double[neighbors.size()]; // calculate and store Median of neighborhood int c = 0; - for(DBID n : neighbors) { - if(id.equals(n)) { + for(DBIDIter iter = neighbors.iter(); iter.valid(); iter.advance()) { + if(id.sameDBID(iter)) { continue; } - fi[c] = relation.get(n).doubleValue(1); + fi[c] = relation.get(iter).doubleValue(1); c++; } @@ -125,7 +127,8 @@ public class CTLuMedianAlgorithm extends AbstractNeighborhoodOutlier { final double mean = mv.getMean(); final double stddev = mv.getNaiveStddev(); DoubleMinMax minmax = new DoubleMinMax(); - for(DBID id : relation.iterDBIDs()) { + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + DBID id = iditer.getDBID(); double score = Math.abs((scores.doubleValue(id) - mean) / stddev); minmax.put(score); scores.putDouble(id, score); diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/CTLuMedianMultipleAttributes.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/CTLuMedianMultipleAttributes.java index cbf61c38..c8bcba74 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/CTLuMedianMultipleAttributes.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/CTLuMedianMultipleAttributes.java @@ -32,6 +32,7 @@ import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; import de.lmu.ifi.dbs.elki.database.datastore.WritableDataStore; import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore; import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import 
de.lmu.ifi.dbs.elki.database.ids.DBIDs; import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation; import de.lmu.ifi.dbs.elki.database.relation.Relation; @@ -108,7 +109,8 @@ public class CTLuMedianMultipleAttributes> exten CovarianceMatrix covmaker = new CovarianceMatrix(dim); WritableDataStore deltas = DataStoreUtil.makeStorage(attributes.getDBIDs(), DataStoreFactory.HINT_TEMP, Vector.class); - for(DBID id : attributes.iterDBIDs()) { + for(DBIDIter iditer = attributes.iterDBIDs(); iditer.valid(); iditer.advance()) { + DBID id = iditer.getDBID(); final O obj = attributes.get(id); final DBIDs neighbors = npred.getNeighborDBIDs(id); // Compute the median vector @@ -117,9 +119,9 @@ public class CTLuMedianMultipleAttributes> exten double[][] data = new double[dim][neighbors.size()]; int i = 0; // Load data - for(DBID n : neighbors) { + for(DBIDIter iter = neighbors.iter(); iter.valid(); iter.advance()) { // TODO: skip object itself within neighbors? - O nobj = attributes.get(n); + O nobj = attributes.get(iter); for(int d = 0; d < dim; d++) { data[d][i] = nobj.doubleValue(d + 1); } @@ -143,7 +145,8 @@ public class CTLuMedianMultipleAttributes> exten DoubleMinMax minmax = new DoubleMinMax(); WritableDoubleDataStore scores = DataStoreUtil.makeDoubleStorage(attributes.getDBIDs(), DataStoreFactory.HINT_STATIC); - for(DBID id : attributes.iterDBIDs()) { + for(DBIDIter iditer = attributes.iterDBIDs(); iditer.valid(); iditer.advance()) { + DBID id = iditer.getDBID(); Vector temp = deltas.get(id).minus(mean); final double score = temp.transposeTimesTimes(cmati, temp); minmax.put(score); diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/CTLuMoranScatterplotOutlier.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/CTLuMoranScatterplotOutlier.java index 9f19757d..7b88ae66 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/CTLuMoranScatterplotOutlier.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/CTLuMoranScatterplotOutlier.java 
@@ -33,6 +33,7 @@ import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore; import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.logging.Logging; @@ -98,7 +99,8 @@ public class CTLuMoranScatterplotOutlier extends AbstractNeighborhoodOutlier< // Compute the global mean and variance MeanVariance globalmv = new MeanVariance(); - for(DBID id : relation.iterDBIDs()) { + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + DBID id = iditer.getDBID(); globalmv.put(relation.get(id).doubleValue(1)); } @@ -107,12 +109,14 @@ public class CTLuMoranScatterplotOutlier extends AbstractNeighborhoodOutlier< // calculate normalized attribute values // calculate neighborhood average of normalized attribute values. 
- for(DBID id : relation.iterDBIDs()) { + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + DBID id = iditer.getDBID(); // Compute global z score final double globalZ = (relation.get(id).doubleValue(1) - globalmv.getMean()) / globalmv.getNaiveStddev(); // Compute local average z score Mean localm = new Mean(); - for(DBID n : npred.getNeighborDBIDs(id)) { + for(DBIDIter iter = npred.getNeighborDBIDs(id).iter(); iter.valid(); iter.advance()) { + DBID n = iter.getDBID(); if(id.equals(n)) { continue; } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/CTLuRandomWalkEC.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/CTLuRandomWalkEC.java index a6425d43..852c4be4 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/CTLuRandomWalkEC.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/CTLuRandomWalkEC.java @@ -34,6 +34,7 @@ import de.lmu.ifi.dbs.elki.database.datastore.WritableDataStore; import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore; import de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs; import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; @@ -208,7 +209,8 @@ public class CTLuRandomWalkEC> extends Abstrac DBID id = ids.get(i); double gmean = 1.0; int cnt = 0; - for(DBID n : neighbors.get(id)) { + for(DBIDIter iter = neighbors.get(id).iter(); iter.valid(); iter.advance()) { + DBID n = iter.getDBID(); if(id.equals(n)) { continue; } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/CTLuScatterplotOutlier.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/CTLuScatterplotOutlier.java index 8e4ab32c..4f11cb38 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/CTLuScatterplotOutlier.java +++ 
b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/CTLuScatterplotOutlier.java @@ -32,6 +32,7 @@ import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore; import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation; import de.lmu.ifi.dbs.elki.database.relation.Relation; @@ -102,12 +103,14 @@ public class CTLuScatterplotOutlier extends AbstractNeighborhoodOutlier { // Calculate average of neighborhood for each object and perform a linear // regression using the covariance matrix CovarianceMatrix covm = new CovarianceMatrix(2); - for(DBID id : relation.iterDBIDs()) { + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + DBID id = iditer.getDBID(); final double local = relation.get(id).doubleValue(1); // Compute mean of neighbors Mean mean = new Mean(); DBIDs neighbors = npred.getNeighborDBIDs(id); - for(DBID n : neighbors) { + for(DBIDIter iter = neighbors.iter(); iter.valid(); iter.advance()) { + DBID n = iter.getDBID(); if(id.equals(n)) { continue; } @@ -139,7 +142,8 @@ public class CTLuScatterplotOutlier extends AbstractNeighborhoodOutlier { // calculate mean and variance for error WritableDoubleDataStore scores = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC); MeanVariance mv = new MeanVariance(); - for(DBID id : relation.iterDBIDs()) { + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + DBID id = iditer.getDBID(); // Compute the error from the linear regression double y_i = relation.get(id).doubleValue(1); double e = means.doubleValue(id) - (slope * y_i + inter); @@ -152,7 +156,8 @@ public class CTLuScatterplotOutlier extends AbstractNeighborhoodOutlier { { final double mean = 
mv.getMean(); final double variance = mv.getNaiveStddev(); - for(DBID id : relation.iterDBIDs()) { + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + DBID id = iditer.getDBID(); double score = Math.abs((scores.doubleValue(id) - mean) / variance); minmax.put(score); scores.putDouble(id, score); diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/CTLuZTestOutlier.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/CTLuZTestOutlier.java index 573e1526..05729481 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/CTLuZTestOutlier.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/CTLuZTestOutlier.java @@ -33,6 +33,7 @@ import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore; import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation; import de.lmu.ifi.dbs.elki.database.relation.Relation; @@ -102,17 +103,17 @@ public class CTLuZTestOutlier extends AbstractNeighborhoodOutlier { WritableDoubleDataStore scores = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC); MeanVariance zmv = new MeanVariance(); - for(DBID id : relation.iterDBIDs()) { + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + DBID id = iditer.getDBID(); DBIDs neighbors = npred.getNeighborDBIDs(id); // Compute Mean of neighborhood Mean localmean = new Mean(); - for(DBID n : neighbors) { + for(DBIDIter iter = neighbors.iter(); iter.valid(); iter.advance()) { + DBID n = iter.getDBID(); if(id.equals(n)) { continue; } - else { - localmean.put(relation.get(n).doubleValue(1)); - } + localmean.put(relation.get(n).doubleValue(1)); } final double localdiff; if(localmean.getCount() > 0) { @@ -127,7 
+128,8 @@ public class CTLuZTestOutlier extends AbstractNeighborhoodOutlier { // Normalize scores using mean and variance DoubleMinMax minmax = new DoubleMinMax(); - for(DBID id : relation.iterDBIDs()) { + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + DBID id = iditer.getDBID(); double score = Math.abs(scores.doubleValue(id) - zmv.getMean()) / zmv.getSampleStddev(); minmax.put(score); scores.putDouble(id, score); diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/SLOM.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/SLOM.java index e69d46d4..8ae23229 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/SLOM.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/SLOM.java @@ -31,6 +31,7 @@ import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore; import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation; @@ -53,7 +54,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Title; * Reference:
* Sanjay Chawla and Pei Sun
* SLOM: a new measure for local spatial outliers
- * in Knowledge and Information Systems 2005 + * in Knowledge and Information Systems 9(4), 412-429, 2006 *

* * This implementation works around some corner cases in SLOM, in particular @@ -68,7 +69,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Title; */ @Title("SLOM: a new measure for local spatial outliers") @Description("Spatial local outlier measure (SLOM), which captures the local behaviour of datum in their spatial neighbourhood") -@Reference(authors = "Sanjay Chawla and Pei Sun", title = "SLOM: a new measure for local spatial outliers", booktitle = "Knowledge and Information Systems 2005", url = "http://rp-www.cs.usyd.edu.au/~chawlarg/papers/KAIS_online.pdf") +@Reference(authors = "Sanjay Chawla and Pei Sun", title = "SLOM: a new measure for local spatial outliers", booktitle = "Knowledge and Information Systems 9(4), 412-429, 2006", url = "http://dx.doi.org/10.1007/s10115-005-0200-2") public class SLOM> extends AbstractDistanceBasedSpatialOutlier { /** * The logger for this class. @@ -98,13 +99,15 @@ public class SLOM> extends AbstractDistance WritableDoubleDataStore modifiedDistance = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP); // calculate D-Tilde - for(DBID id : relation.iterDBIDs()) { + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + DBID id = iditer.getDBID(); double sum = 0; double maxDist = 0; int cnt = 0; final DBIDs neighbors = npred.getNeighborDBIDs(id); - for(DBID neighbor : neighbors) { + for(DBIDIter iter = neighbors.iter(); iter.valid(); iter.advance()) { + DBID neighbor = iter.getDBID(); if(id.equals(neighbor)) { continue; } @@ -127,12 +130,14 @@ public class SLOM> extends AbstractDistance DoubleMinMax slomminmax = new DoubleMinMax(); WritableDoubleDataStore sloms = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC); - for(DBID id : relation.iterDBIDs()) { + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + DBID id = iditer.getDBID(); double sum = 0; int cnt = 0; final DBIDs 
neighbors = npred.getNeighborDBIDs(id); - for(DBID neighbor : neighbors) { + for(DBIDIter iter = neighbors.iter(); iter.valid(); iter.advance()) { + DBID neighbor = iter.getDBID(); if(neighbor.equals(id)) { continue; } @@ -146,7 +151,8 @@ public class SLOM> extends AbstractDistance double avg = sum / cnt; double beta = 0; - for(DBID neighbor : neighbors) { + for(DBIDIter iter = neighbors.iter(); iter.valid(); iter.advance()) { + DBID neighbor = iter.getDBID(); final double dist = modifiedDistance.doubleValue(neighbor); if(dist > avgPlus) { beta += 1; diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/SOF.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/SOF.java index abc3c481..e9987bf0 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/SOF.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/SOF.java @@ -30,6 +30,7 @@ import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore; import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation; @@ -108,11 +109,12 @@ public class SOF> extends AbstractDistanceB DoubleMinMax lofminmax = new DoubleMinMax(); // Compute densities - for(DBID id : relation.iterDBIDs()) { + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + DBID id = iditer.getDBID(); DBIDs neighbors = npred.getNeighborDBIDs(id); double avg = 0; - for(DBID n : neighbors) { - avg += distFunc.distance(id, n).doubleValue(); + for(DBIDIter iter = neighbors.iter(); iter.valid(); iter.advance()) { + avg += distFunc.distance(id, iter.getDBID()).doubleValue(); } double lrd = 1 / (avg / neighbors.size()); if (Double.isNaN(lrd)) { @@ -122,11 +124,12 @@ public 
class SOF> extends AbstractDistanceB } // Compute density quotients - for(DBID id : relation.iterDBIDs()) { + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + DBID id = iditer.getDBID(); DBIDs neighbors = npred.getNeighborDBIDs(id); double avg = 0; - for(DBID n : neighbors) { - avg += lrds.doubleValue(n); + for(DBIDIter iter = neighbors.iter(); iter.valid(); iter.advance()) { + avg += lrds.doubleValue(iter.getDBID()); } final double lrd = (avg / neighbors.size()) / lrds.doubleValue(id); if (!Double.isNaN(lrd)) { diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/TrimmedMeanApproach.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/TrimmedMeanApproach.java index 75700bca..41022414 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/TrimmedMeanApproach.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/TrimmedMeanApproach.java @@ -34,6 +34,7 @@ import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore; import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation; import de.lmu.ifi.dbs.elki.database.relation.Relation; @@ -116,13 +117,14 @@ public class TrimmedMeanApproach extends AbstractNeighborhoodOutlier { WritableDoubleDataStore scores = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC); FiniteProgress progress = logger.isVerbose() ? 
new FiniteProgress("Computing trimmed means", relation.size(), logger) : null; - for(DBID id : relation.iterDBIDs()) { + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + DBID id = iditer.getDBID(); DBIDs neighbors = npred.getNeighborDBIDs(id); int num = 0; double[] values = new double[neighbors.size()]; // calculate trimmedMean - for(DBID n : neighbors) { - values[num] = relation.get(n).doubleValue(1); + for(DBIDIter iter = neighbors.iter(); iter.valid(); iter.advance()) { + values[num] = relation.get(iter).doubleValue(1); num++; } @@ -161,7 +163,8 @@ public class TrimmedMeanApproach extends AbstractNeighborhoodOutlier { double[] ei = new double[relation.size()]; { int i = 0; - for(DBID id : relation.iterDBIDs()) { + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + DBID id = iditer.getDBID(); ei[i] = errors.doubleValue(id); i++; } @@ -180,7 +183,8 @@ public class TrimmedMeanApproach extends AbstractNeighborhoodOutlier { } // calculate score DoubleMinMax minmax = new DoubleMinMax(); - for(DBID id : relation.iterDBIDs()) { + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + DBID id = iditer.getDBID(); double score = Math.abs(errors.doubleValue(id)) * 0.6745 / median_dev_from_median; scores.putDouble(id, score); minmax.put(score); diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/neighborhood/ExtendedNeighborhood.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/neighborhood/ExtendedNeighborhood.java index 9ee92d35..7a2fda52 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/neighborhood/ExtendedNeighborhood.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/neighborhood/ExtendedNeighborhood.java @@ -29,6 +29,7 @@ import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; import de.lmu.ifi.dbs.elki.database.datastore.WritableDataStore; import 
de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; import de.lmu.ifi.dbs.elki.database.ids.HashSetModifiableDBIDs; @@ -132,15 +133,17 @@ public class ExtendedNeighborhood extends AbstractPrecomputedNeighborhood { // Expand multiple steps FiniteProgress progress = logger.isVerbose() ? new FiniteProgress("Expanding neighborhoods", database.size(), logger) : null; - for(final DBID id : database.iterDBIDs()) { + for(DBIDIter iter = database.iterDBIDs(); iter.valid(); iter.advance()) { + DBID id = iter.getDBID(); HashSetModifiableDBIDs res = DBIDUtil.newHashSet(id); DBIDs todo = id; for(int i = 0; i < steps; i++) { ModifiableDBIDs ntodo = DBIDUtil.newHashSet(); - for(final DBID oid : todo) { - DBIDs add = innerinst.getNeighborDBIDs(oid); + for(DBIDIter iter2 = todo.iter(); iter2.valid(); iter2.advance()) { + DBIDs add = innerinst.getNeighborDBIDs(iter2.getDBID()); if(add != null) { - for(DBID nid : add) { + for(DBIDIter iter3 = add.iter(); iter3.valid(); iter3.advance()) { + DBID nid = iter3.getDBID(); if(res.contains(nid)) { continue; } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/neighborhood/ExternalNeighborhood.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/neighborhood/ExternalNeighborhood.java index f2586e2e..74e5bbcf 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/neighborhood/ExternalNeighborhood.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/neighborhood/ExternalNeighborhood.java @@ -42,6 +42,7 @@ import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; import de.lmu.ifi.dbs.elki.database.datastore.WritableDataStore; import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; 
import de.lmu.ifi.dbs.elki.database.relation.Relation; @@ -149,7 +150,8 @@ public class ExternalNeighborhood extends AbstractPrecomputedNeighborhood { { Relation olq = database.getDatabase().getRelation(TypeUtil.LABELLIST); Relation eidq = database.getDatabase().getRelation(TypeUtil.EXTERNALID); - for(DBID id : database.iterDBIDs()) { + for(DBIDIter iditer = database.iterDBIDs(); iditer.valid(); iditer.advance()) { + DBID id = iditer.getDBID(); if(eidq != null) { ExternalID eid = eidq.get(id); if(eid != null) { diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/neighborhood/PrecomputedKNearestNeighborNeighborhood.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/neighborhood/PrecomputedKNearestNeighborNeighborhood.java index f5ea7e15..9dd2dee1 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/neighborhood/PrecomputedKNearestNeighborNeighborhood.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/neighborhood/PrecomputedKNearestNeighborNeighborhood.java @@ -30,6 +30,7 @@ import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; import de.lmu.ifi.dbs.elki.database.datastore.WritableDataStore; import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; import de.lmu.ifi.dbs.elki.database.query.DistanceResultPair; @@ -119,7 +120,8 @@ public class PrecomputedKNearestNeighborNeighborhood> exte // TODO: use bulk? 
WritableDataStore s = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_STATIC, DBIDs.class); - for(DBID id : relation.iterDBIDs()) { + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + DBID id = iditer.getDBID(); KNNResult neighbors = knnQuery.getKNNForDBID(id, k); ArrayModifiableDBIDs neighbours = DBIDUtil.newArray(neighbors.size()); for(DistanceResultPair dpair : neighbors) { diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/neighborhood/weighted/LinearWeightedExtendedNeighborhood.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/neighborhood/weighted/LinearWeightedExtendedNeighborhood.java index 52fc2c46..d170571f 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/neighborhood/weighted/LinearWeightedExtendedNeighborhood.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/neighborhood/weighted/LinearWeightedExtendedNeighborhood.java @@ -30,6 +30,7 @@ import java.util.List; import de.lmu.ifi.dbs.elki.algorithm.outlier.spatial.neighborhood.NeighborSetPredicate; import de.lmu.ifi.dbs.elki.data.type.TypeInformation; import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; @@ -99,8 +100,10 @@ public class LinearWeightedExtendedNeighborhood implements WeightedNeighborSetPr final double weight = computeWeight(i); // Collect newly discovered IDs ModifiableDBIDs add = DBIDUtil.newHashSet(); - for(DBID id : cur) { - for(DBID nid : inner.getNeighborDBIDs(id)) { + for(DBIDIter iter = cur.iter(); iter.valid(); iter.advance()) { + DBID id = iter.getDBID(); + for(DBIDIter iter2 = inner.getNeighborDBIDs(id).iter(); iter2.valid(); iter2.advance()) { + DBID nid = iter2.getDBID(); // Seen before? 
if(seen.contains(nid)) { continue; diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/neighborhood/weighted/UnweightedNeighborhoodAdapter.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/neighborhood/weighted/UnweightedNeighborhoodAdapter.java index 4378aa2e..ce0666df 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/neighborhood/weighted/UnweightedNeighborhoodAdapter.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/neighborhood/weighted/UnweightedNeighborhoodAdapter.java @@ -29,6 +29,7 @@ import java.util.Collection; import de.lmu.ifi.dbs.elki.algorithm.outlier.spatial.neighborhood.NeighborSetPredicate; import de.lmu.ifi.dbs.elki.data.type.TypeInformation; import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; @@ -63,7 +64,8 @@ public class UnweightedNeighborhoodAdapter implements WeightedNeighborSetPredica public Collection> getWeightedNeighbors(DBID reference) { DBIDs neighbors = inner.getNeighborDBIDs(reference); ArrayList> adapted = new ArrayList>(neighbors.size()); - for(DBID id : neighbors) { + for(DBIDIter iter = neighbors.iter(); iter.valid(); iter.advance()) { + DBID id = iter.getDBID(); adapted.add(new DoubleObjPair(1.0, id)); } return adapted; diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/subspace/OUTRES.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/subspace/OUTRES.java new file mode 100644 index 00000000..573233a7 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/subspace/OUTRES.java @@ -0,0 +1,428 @@ +package de.lmu.ifi.dbs.elki.algorithm.outlier.subspace; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2012 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für 
Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see . + */ + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.BitSet; +import java.util.List; + +import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm; +import de.lmu.ifi.dbs.elki.algorithm.outlier.OutlierAlgorithm; +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.data.type.TypeInformation; +import de.lmu.ifi.dbs.elki.data.type.TypeUtil; +import de.lmu.ifi.dbs.elki.database.QueryUtil; +import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; +import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; +import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore; +import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; +import de.lmu.ifi.dbs.elki.database.ids.DBIDRef; +import de.lmu.ifi.dbs.elki.database.query.DistanceResultPair; +import de.lmu.ifi.dbs.elki.database.query.DoubleDistanceResultPair; +import de.lmu.ifi.dbs.elki.database.query.range.RangeQuery; +import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDoubleDistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancefunction.subspace.SubspaceEuclideanDistanceFunction; +import 
de.lmu.ifi.dbs.elki.distance.distancevalue.DoubleDistance; +import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress; +import de.lmu.ifi.dbs.elki.math.DoubleMinMax; +import de.lmu.ifi.dbs.elki.math.MeanVariance; +import de.lmu.ifi.dbs.elki.math.statistics.EpanechnikovKernelDensityFunction; +import de.lmu.ifi.dbs.elki.math.statistics.KernelDensityFunction; +import de.lmu.ifi.dbs.elki.math.statistics.distribution.GammaDistribution; +import de.lmu.ifi.dbs.elki.result.outlier.InvertedOutlierScoreMeta; +import de.lmu.ifi.dbs.elki.result.outlier.OutlierResult; +import de.lmu.ifi.dbs.elki.result.outlier.OutlierScoreMeta; +import de.lmu.ifi.dbs.elki.utilities.DatabaseUtil; +import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter; + +/** + * Adaptive outlierness for subspace outlier ranking (OUTRES). + * + * Note: this algorithm seems to have a O(n^3) complexity without appropriate + * index structures to accelerate range queries: each object in each tested + * subspace will need to know the mean and standard deviation of the density of + * the neighbors, which in turn needs another range query. + * + * Reference: + *

+ * E. Müller, M. Schiffer, T. Seidl
+ * Adaptive outlierness for subspace outlier ranking
+ * in: Proc. 19th ACM International Conference on Information and knowledge + * management + *

+ * + * @author Viktoria Pleintinger + * @author Erich Schubert + * + * @apiviz.composedOf KernelDensityEstimator + * + * @param vector type + */ +@Reference(authors = "E. Müller, M. Schiffer, T. Seidl", title = "Adaptive outlierness for subspace outlier ranking", booktitle = "Proc. 19th ACM International Conference on Information and knowledge management") +public class OUTRES> extends AbstractAlgorithm implements OutlierAlgorithm { + /** + * The logger for this class. + */ + private static final Logging logger = Logging.getLogger(OUTRES.class); + + /** + * The epsilon (in 2d) parameter + */ + private final double eps; + + /** + * Constant for Kolmogorov-Smirnov at alpha=0.01 (table value) + */ + private static final double K_S_CRITICAL001 = 1.63; + + /** + * Constructor. + * + * @param eps Epsilon + */ + public OUTRES(double eps) { + super(); + this.eps = eps; + } + + /** + * Main loop for OUTRES + * + * @param relation Relation to process + * @return Outlier detection result + */ + public OutlierResult run(Relation relation) { + WritableDoubleDataStore ranks = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC); + DoubleMinMax minmax = new DoubleMinMax(); + + KernelDensityEstimator kernel = new KernelDensityEstimator(relation); + BitSet subspace = new BitSet(kernel.dim); + + FiniteProgress progress = logger.isVerbose() ? 
new FiniteProgress("OutRank scores", relation.size(), logger) : null; + + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + subspace.clear(); + double score = outresScore(0, subspace, iditer, kernel); + ranks.putDouble(iditer, score); + minmax.put(score); + if(progress != null) { + progress.incrementProcessed(logger); + } + } + if(progress != null) { + progress.ensureCompleted(logger); + } + + OutlierScoreMeta meta = new InvertedOutlierScoreMeta(minmax.getMin(), minmax.getMax(), 0., 1., 1.); + OutlierResult outresResult = new OutlierResult(meta, new MaterializedRelation("OUTRES", "outres-score", TypeUtil.DOUBLE, ranks, relation.getDBIDs())); + return outresResult; + } + + /** + * Main loop of OUTRES. Run for each object + * + * @param s start dimension + * @param subspace Current subspace + * @param id Current object ID + * @param kernel Kernel + * @return Score + */ + public double outresScore(final int s, BitSet subspace, DBIDRef id, KernelDensityEstimator kernel) { + double score = 1.0; // Initial score is 1.0 + + for(int i = s; i < kernel.dim; i++) { + if(subspace.get(i)) { // TODO: needed? Or should we always start with i=0? + continue; + } + subspace.set(i); + final SubspaceEuclideanDistanceFunction df = new SubspaceEuclideanDistanceFunction(subspace); + final double adjustedEps = kernel.adjustedEps(kernel.dim); + // Query with a larger window, to also get neighbors of neighbors + // Subspace euclidean is metric! + final DoubleDistance range = new DoubleDistance(adjustedEps * 2); + RangeQuery rq = QueryUtil.getRangeQuery(kernel.relation, df, range); + + List> neighc = rq.getRangeForDBID(id, range); + List neigh = refineRange(neighc, adjustedEps); + if(neigh.size() > 2) { + // Relevance test + if(relevantSubspace(subspace, neigh, kernel)) { + final double density = kernel.subspaceDensity(subspace, neigh); + final double deviation; + // Compute mean and standard deviation for densities of neighbors. 
+ MeanVariance meanv = new MeanVariance(); + for(DoubleDistanceResultPair pair : neigh) { + List n2 = subsetNeighborhoodQuery(neighc, pair.getDBID(), df, adjustedEps, kernel); + meanv.put(kernel.subspaceDensity(subspace, n2)); + } + deviation = (meanv.getMean() - density) / (2. * meanv.getSampleStddev()); + // High deviation: + if(deviation >= 1) { + score *= (density / deviation); + } + // Recursion + score *= outresScore(i + 1, subspace, id, kernel); + } + } + subspace.clear(i); + } + return score; + } + + /** + * Refine a range query. + * + * @param neighc Original result + * @param adjustedEps New epsilon + * @return refined list + */ + private List refineRange(List> neighc, double adjustedEps) { + List n = new ArrayList(neighc.size()); + // We don't have a guarantee for this list to be sorted + for(DistanceResultPair p : neighc) { + if(p instanceof DoubleDistanceResultPair) { + if(((DoubleDistanceResultPair) p).getDoubleDistance() <= adjustedEps) { + n.add((DoubleDistanceResultPair) p); + } + } + else { + double dist = p.getDistance().doubleValue(); + if(dist <= adjustedEps) { + n.add(new DoubleDistanceResultPair(dist, p.getDBID())); + } + } + } + return n; + } + + /** + * Refine neighbors within a subset. + * + * @param neighc Neighbor candidates + * @param dbid Query object + * @param df distance function + * @param adjustedEps Epsilon range + * @param kernel Kernel + * @return Neighbors of neighbor object + */ + private List subsetNeighborhoodQuery(List> neighc, DBID dbid, PrimitiveDoubleDistanceFunction df, double adjustedEps, KernelDensityEstimator kernel) { + List n = new ArrayList(neighc.size()); + V query = kernel.relation.get(dbid); + for(DistanceResultPair p : neighc) { + double dist = df.doubleDistance(query, kernel.relation.get(p)); + if(dist <= adjustedEps) { + n.add(new DoubleDistanceResultPair(dist, p.getDBID())); + } + } + return n; + } + + /** + * Subspace relevance test. 
+ * + * @param subspace Subspace to test + * @param neigh Neighbor list + * @param kernel Kernel density estimator + * @return relevance test result + */ + protected boolean relevantSubspace(BitSet subspace, List neigh, KernelDensityEstimator kernel) { + Relation relation = kernel.relation; + final double crit = K_S_CRITICAL001 / Math.sqrt(neigh.size()); + + for(int dim = subspace.nextSetBit(0); dim >= 0; dim = subspace.nextSetBit(dim + 1)) { // nextSetBit returns -1 when exhausted; bit 0 is a valid dimension + // TODO: can we save this copy somehow? + double[] data = new double[neigh.size()]; + { + int count = 0; + for(DoubleDistanceResultPair object : neigh) { + V vector = relation.get(object.getDBID()); + data[count] = vector.doubleValue(dim + 1); + count++; + } + assert (count == neigh.size()); + } + Arrays.sort(data); + + final double norm = data[data.length - 1] - data[0]; + final double min = data[0]; + + // Kolmogorow-Smirnow-Test against uniform distribution: + for(int j = 1; j < data.length - 2; j++) { + double delta = (j / (double) (data.length - 1)) - ((data[j] - min) / norm); // floating-point quantile; integer division would always yield 0 here + if(Math.abs(delta) > crit) { + return false; + } + } + } + return true; + } + + /** + * Kernel density estimation and utility class. + * + * @author Erich Schubert + */ + protected class KernelDensityEstimator { + /** + * Actual kernel in use + */ + final KernelDensityFunction kernel = EpanechnikovKernelDensityFunction.KERNEL; + + /** + * Relation to retrieve data from + */ + final Relation relation; + + /** + * Epsilon values for different subspace dimensionalities + */ + final double[] epsilons; + + /** + * Optimal bandwidth for a dimensionality of 2 + */ + final double hopttwo; + + /** + * Dimensionality of data set + */ + final int dim; + + /** + * Constructor. 
+ * + * @param relation Relation to apply to + */ + public KernelDensityEstimator(Relation relation) { + super(); + this.relation = relation; + dim = DatabaseUtil.dimensionality(relation); + hopttwo = optimalBandwidth(2); + epsilons = new double[dim + 1]; + Arrays.fill(epsilons, Double.NEGATIVE_INFINITY); + epsilons[2] = OUTRES.this.eps; + } + + /** + * Compute density in the given subspace. + * + * @param subspace Subspace + * @param neighbours Neighbor distance list + * @return Density + */ + protected double subspaceDensity(BitSet subspace, List neighbours) { + final double bandwidth = optimalBandwidth(subspace.cardinality()); + + double density = 0; + for(DoubleDistanceResultPair pair : neighbours) { + double v = pair.getDoubleDistance() / bandwidth; + if(v < 1) { + density += 1 - (v * v); + } + } + + return density / relation.size(); + } + + /** + * Compute optimal kernel bandwidth + * + * @param dim Dimensionality of subspace + * @return optimal bandwidth + */ + protected double optimalBandwidth(int dim) { + // Pi in the publication is redundant and cancels out! + double hopt = 8 * GammaDistribution.gamma(dim / 2.0 + 1) * (dim + 4) * Math.pow(2, dim); + return hopt * Math.pow(relation.size(), (-1. / (dim + 4))); // floating-point exponent; integer division would truncate it to 0 + } + + /** + * Rescale the query radius based on the given dimensionality. + * + * @param dim Dimensionality + * @return Query radius + */ + protected double adjustedEps(int dim) { + // Cached + double e = epsilons[dim]; + if(e < 0) { + e = epsilons[2] * optimalBandwidth(dim) / hopttwo; + epsilons[dim] = e; + } + return e; + } + } + + @Override + protected Logging getLogger() { + return logger; + } + + @Override + public TypeInformation[] getInputTypeRestriction() { + return TypeUtil.array(TypeUtil.NUMBER_VECTOR_FIELD); + } + + /** + * Parameterization class. 
+ * + * @author Viktoria Pleintinger + * + * @apiviz.exclude + */ + public static class Parameterizer> extends AbstractParameterizer { + /** + * Option ID for Epsilon parameter + */ + public static final OptionID D_ID = OptionID.getOrCreateOptionID("outres.epsilon", "Range value for OUTRES in 2 dimensions."); + + /** + * Query radius + */ + protected double eps; + + @Override + protected void makeOptions(Parameterization config) { + super.makeOptions(config); + final DoubleParameter param = new DoubleParameter(D_ID); + if(config.grab(param)) { + eps = param.getValue(); + } + } + + @Override + protected OUTRES makeInstance() { + return new OUTRES(eps); + } + } +} \ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/subspace/OutRankS1.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/subspace/OutRankS1.java new file mode 100644 index 00000000..e370d2bf --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/subspace/OutRankS1.java @@ -0,0 +1,199 @@ +package de.lmu.ifi.dbs.elki.algorithm.outlier.subspace; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2012 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see . 
+ */ + +import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm; +import de.lmu.ifi.dbs.elki.algorithm.clustering.subspace.SubspaceClusteringAlgorithm; +import de.lmu.ifi.dbs.elki.algorithm.outlier.OutlierAlgorithm; +import de.lmu.ifi.dbs.elki.data.Cluster; +import de.lmu.ifi.dbs.elki.data.Clustering; +import de.lmu.ifi.dbs.elki.data.model.SubspaceModel; +import de.lmu.ifi.dbs.elki.data.type.TypeInformation; +import de.lmu.ifi.dbs.elki.data.type.TypeUtil; +import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; +import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; +import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; +import de.lmu.ifi.dbs.elki.database.ids.DBIDs; +import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.math.DoubleMinMax; +import de.lmu.ifi.dbs.elki.result.outlier.InvertedOutlierScoreMeta; +import de.lmu.ifi.dbs.elki.result.outlier.OutlierResult; +import de.lmu.ifi.dbs.elki.result.outlier.OutlierScoreMeta; +import de.lmu.ifi.dbs.elki.utilities.documentation.Description; +import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; +import de.lmu.ifi.dbs.elki.utilities.documentation.Title; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; + +/** + * OutRank: ranking outliers in high dimensional data. 
+ * + * Algorithm to score outliers based on a subspace clustering result. This class + * implements score 1 of the OutRank publication, which is a score based on + * cluster sizes and cluster dimensionality. + * + * Reference: + *

+ * Emmanuel Müller, Ira Assent, Uwe Steinhausen, Thomas Seidl
+ * OutRank: ranking outliers in high dimensional data
+ * In Proceedings 24th International Conference on Data Engineering (ICDE) + * Workshop on Ranking in Databases (DBRank), Cancun, Mexico + *

+ * + * @author Erich Schubert + */ +@Title("OutRank: ranking outliers in high dimensional data") +@Description("Ranking outliers in high dimensional data - score 1") +@Reference(authors = "Emmanuel Müller, Ira Assent, Uwe Steinhausen, Thomas Seidl", title = "OutRank: ranking outliers in high dimensional data", booktitle = "Proc. 24th Int. Conf. on Data Engineering (ICDE) Workshop on Ranking in Databases (DBRank), Cancun, Mexico", url = "http://dx.doi.org/10.1109/ICDEW.2008.4498387") +public class OutRankS1 extends AbstractAlgorithm implements OutlierAlgorithm { + /** + * The logger for this class. + */ + private static final Logging logger = Logging.getLogger(OutRankS1.class); + + /** + * Clustering algorithm to run. + */ + protected SubspaceClusteringAlgorithm> clusteralg; + + /** + * Weighting parameter of size vs. dimensionality score. + */ + double alpha; + + /** + * Constructor. + * + * @param clusteralg Clustering algorithm to use (must implement + * {@link SubspaceClusteringAlgorithm}!) + * @param alpha Alpha parameter to balance size and dimensionality. 
+ */ + public OutRankS1(SubspaceClusteringAlgorithm> clusteralg, double alpha) { + super(); + this.clusteralg = clusteralg; + this.alpha = alpha; + } + + @Override + public OutlierResult run(Database database) { + DBIDs ids = database.getRelation(TypeUtil.DBID).getDBIDs(); + // Run the primary algorithm + Clustering> clustering = clusteralg.run(database); + + WritableDoubleDataStore score = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT); + for(DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) { + score.putDouble(iter, 0); + } + + int maxdim = 0, maxsize = 0; + // Find maximum dimensionality and cluster size + for(Cluster> cluster : clustering.getAllClusters()) { + maxsize = Math.max(maxsize, cluster.size()); + maxdim = Math.max(maxdim, cluster.getModel().getDimensions().cardinality()); + } + // Iterate over all clusters: + DoubleMinMax minmax = new DoubleMinMax(); + for(Cluster> cluster : clustering.getAllClusters()) { + double relsize = cluster.size() / (double) maxsize; + double reldim = cluster.getModel().getDimensions().cardinality() / (double) maxdim; + // Process objects in the cluster + for(DBIDIter iter = cluster.getIDs().iter(); iter.valid(); iter.advance()) { + double newscore = score.doubleValue(iter) + alpha * relsize + (1 - alpha) * reldim; + score.putDouble(iter, newscore); + minmax.put(newscore); + } + } + + Relation scoreResult = new MaterializedRelation("OutRank-S1", "OUTRANK_S1", TypeUtil.DOUBLE, score, ids); + OutlierScoreMeta meta = new InvertedOutlierScoreMeta(minmax.getMin(), minmax.getMax(), 0, Double.POSITIVE_INFINITY); + OutlierResult res = new OutlierResult(meta, scoreResult); + res.addChildResult(clustering); + return res; + } + + @Override + public TypeInformation[] getInputTypeRestriction() { + return clusteralg.getInputTypeRestriction(); + } + + @Override + protected Logging getLogger() { + return logger; + } + + /** + * Parameterization class. 
+ * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer extends AbstractParameterizer { + /** + * Clustering algorithm to use. + */ + public static final OptionID ALGORITHM_ID = OptionID.getOrCreateOptionID("outrank.algorithm", "Subspace clustering algorithm to use."); + + /** + * Alpha parameter for S1 + */ + public static final OptionID ALPHA_ID = OptionID.getOrCreateOptionID("outrank.s1.alpha", "Alpha parameter for S1 score."); + + /** + * Clustering algorithm to run. + */ + protected SubspaceClusteringAlgorithm> algorithm = null; + + /** + * Alpha parameter to balance parameters + */ + protected double alpha = 0.25; + + @Override + protected void makeOptions(Parameterization config) { + super.makeOptions(config); + ObjectParameter>> algP = new ObjectParameter>>(ALGORITHM_ID, SubspaceClusteringAlgorithm.class); + if(config.grab(algP)) { + algorithm = algP.instantiateClass(config); + } + DoubleParameter alphaP = new DoubleParameter(ALPHA_ID, new GreaterConstraint(0), 0.25); + if(config.grab(alphaP)) { + alpha = alphaP.getValue(); + } + } + + @Override + protected OutRankS1 makeInstance() { + return new OutRankS1(algorithm, alpha); + } + } +} \ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/subspace/SOD.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/subspace/SOD.java new file mode 100644 index 00000000..7fef95e0 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/subspace/SOD.java @@ -0,0 +1,479 @@ +package de.lmu.ifi.dbs.elki.algorithm.outlier.subspace; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2012 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, 
either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see . + */ + +import java.util.BitSet; + +import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm; +import de.lmu.ifi.dbs.elki.algorithm.outlier.OutlierAlgorithm; +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation; +import de.lmu.ifi.dbs.elki.data.type.TypeInformation; +import de.lmu.ifi.dbs.elki.data.type.TypeUtil; +import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; +import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; +import de.lmu.ifi.dbs.elki.database.datastore.WritableDataStore; +import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs; +import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; +import de.lmu.ifi.dbs.elki.database.ids.DBIDRef; +import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.ids.DBIDs; +import de.lmu.ifi.dbs.elki.database.query.similarity.SimilarityQuery; +import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.distance.distancefunction.subspace.SubspaceEuclideanDistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance; +import de.lmu.ifi.dbs.elki.distance.similarityfunction.SharedNearestNeighborSimilarityFunction; +import de.lmu.ifi.dbs.elki.distance.similarityfunction.SimilarityFunction; +import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress; 
+import de.lmu.ifi.dbs.elki.math.DoubleMinMax; +import de.lmu.ifi.dbs.elki.result.ResultHierarchy; +import de.lmu.ifi.dbs.elki.result.outlier.BasicOutlierScoreMeta; +import de.lmu.ifi.dbs.elki.result.outlier.OutlierResult; +import de.lmu.ifi.dbs.elki.result.outlier.OutlierScoreMeta; +import de.lmu.ifi.dbs.elki.result.textwriter.TextWriteable; +import de.lmu.ifi.dbs.elki.result.textwriter.TextWriterStream; +import de.lmu.ifi.dbs.elki.utilities.DatabaseUtil; +import de.lmu.ifi.dbs.elki.utilities.datastructures.heap.Heap; +import de.lmu.ifi.dbs.elki.utilities.datastructures.heap.TiedTopBoundedHeap; +import de.lmu.ifi.dbs.elki.utilities.documentation.Description; +import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; +import de.lmu.ifi.dbs.elki.utilities.documentation.Title; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; +import de.lmu.ifi.dbs.elki.utilities.pairs.DoubleObjPair; + +/** + * Subspace Outlier Degree. Outlier detection method for axis-parallel subspaces. + * + * Reference: + *

+ * * H.-P. Kriegel, P. Kröger, E. Schubert, A. Zimek:
+ * Outlier Detection in Axis-Parallel Subspaces of High Dimensional Data
+ * In: Proceedings of the 13th Pacific-Asia Conference on Knowledge Discovery + * and Data Mining (PAKDD), Bangkok, Thailand, 2009 + *

+ * + * @author Arthur Zimek + * + * @apiviz.has SODModel oneway - - computes + * @apiviz.has SharedNearestNeighborSimilarityFunction + * + * @param the type of NumberVector handled by this Algorithm + */ +// todo arthur comment +@Title("SOD: Subspace outlier degree") +@Description("Outlier Detection in Axis-Parallel Subspaces of High Dimensional Data") +@Reference(authors = "H.-P. Kriegel, P. Kröger, E. Schubert, A. Zimek", title = "Outlier Detection in Axis-Parallel Subspaces of High Dimensional Data", booktitle = "Proceedings of the 13th Pacific-Asia Conference on Knowledge Discovery and Data Mining (PAKDD), Bangkok, Thailand, 2009", url = "http://dx.doi.org/10.1007/978-3-642-01307-2") +public class SOD, D extends NumberDistance> extends AbstractAlgorithm implements OutlierAlgorithm { + /** + * The logger for this class. + */ + private static final Logging logger = Logging.getLogger(SOD.class); + + /** + * Parameter to specify the number of shared nearest neighbors to be + * considered for learning the subspace properties., must be an integer + * greater than 0. + */ + public static final OptionID KNN_ID = OptionID.getOrCreateOptionID("sod.knn", "The number of most snn-similar objects to use as reference set for learning the subspace properties."); + + /** + * Parameter to indicate the multiplier for the discriminance value for + * discerning small from large variances. + */ + public static final OptionID ALPHA_ID = OptionID.getOrCreateOptionID("sod.alpha", "The multiplier for the discriminance value for discerning small from large variances."); + + /** + * Parameter for the similarity function. + */ + public static final OptionID SIM_ID = OptionID.getOrCreateOptionID("sod.similarity", "The similarity function used for the neighborhood set."); + + /** + * Holds the value of {@link #KNN_ID}. + */ + private int knn; + + /** + * Holds the value of {@link #ALPHA_ID}. + */ + private double alpha; + + /** + * The similarity function {@link #SIM_ID}. 
+ */ + private SimilarityFunction similarityFunction; + + /** + * Constructor with parameters. + * + * @param knn knn value + * @param alpha Alpha parameter + * @param similarityFunction Shared nearest neighbor similarity function + */ + public SOD(int knn, double alpha, SimilarityFunction similarityFunction) { + super(); + this.knn = knn; + this.alpha = alpha; + this.similarityFunction = similarityFunction; + } + + /** + * Performs the SOD algorithm on the given database. + * + * @param relation Data relation to process + * @return Outlier result + */ + public OutlierResult run(Relation relation) { + SimilarityQuery snnInstance = similarityFunction.instantiate(relation); + FiniteProgress progress = logger.isVerbose() ? new FiniteProgress("Assigning Subspace Outlier Degree", relation.size(), logger) : null; + WritableDataStore> sod_models = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC, SODModel.class); + DoubleMinMax minmax = new DoubleMinMax(); + for(DBIDIter iter = relation.iterDBIDs(); iter.valid(); iter.advance()) { + if(progress != null) { + progress.incrementProcessed(logger); + } + DBIDs knnList = getNearestNeighbors(relation, snnInstance, iter); + SODModel model = new SODModel(relation, knnList, alpha, relation.get(iter)); + sod_models.put(iter, model); + minmax.put(model.getSod()); + } + if(progress != null) { + progress.ensureCompleted(logger); + } + // combine results. + Relation> models = new MaterializedRelation>("Subspace Outlier Model", "sod-outlier", new SimpleTypeInformation>(SODModel.class), sod_models, relation.getDBIDs()); + OutlierScoreMeta meta = new BasicOutlierScoreMeta(minmax.getMin(), minmax.getMax()); + OutlierResult sodResult = new OutlierResult(meta, new SODProxyScoreResult(models, relation.getDBIDs())); + // also add the models. + sodResult.addChildResult(models); + return sodResult; + } + + /** + * Provides the k nearest neighbors in terms of the shared nearest neighbor + * distance. + *

+ * The query object is excluded from the knn list. + * + * @param relation the database holding the objects + * @param simQ similarity function + * @param queryObject the query object for which the kNNs should be determined + * @return the k nearest neighbors in terms of the shared nearest neighbor + * distance without the query object + */ + private DBIDs getNearestNeighbors(Relation relation, SimilarityQuery simQ, DBIDRef queryObject) { + // similarityFunction.getPreprocessor().getParameters(); + Heap> nearestNeighbors = new TiedTopBoundedHeap>(knn); + for(DBIDIter iter = relation.iterDBIDs(); iter.valid(); iter.advance()) { + if(!iter.sameDBID(queryObject)) { + double sim = simQ.similarity(queryObject, iter).doubleValue(); + if(sim > 0) { + nearestNeighbors.add(new DoubleObjPair(sim, iter.getDBID())); + } + } + } + // Collect DBIDs + ArrayModifiableDBIDs dbids = DBIDUtil.newArray(nearestNeighbors.size()); + while(nearestNeighbors.size() > 0) { + final DoubleObjPair next = nearestNeighbors.poll(); + dbids.add(next.second); + } + return dbids; + } + + @Override + public TypeInformation[] getInputTypeRestriction() { + return TypeUtil.array(TypeUtil.NUMBER_VECTOR_FIELD); + } + + @Override + protected Logging getLogger() { + return logger; + } + + /** + * + * + * @author Arthur Zimek + * @param the type of DatabaseObjects handled by this Result + */ + // TODO: arthur comment + public static class SODModel> implements TextWriteable, Comparable> { + private double[] centerValues; + + private V center; + + private double[] variances; + + private double expectationOfVariance; + + private BitSet weightVector; + + private double sod; + + /** + * Initialize SOD Model + * + * @param relation Database + * @param neighborhood Neighborhood + * @param alpha Alpha value + * @param queryObject Query object + */ + public SODModel(Relation relation, DBIDs neighborhood, double alpha, V queryObject) { + if(neighborhood.size() > 0) { + // TODO: store database link? 
+ centerValues = new double[DatabaseUtil.dimensionality(relation)]; + variances = new double[centerValues.length]; + for(DBIDIter iter = neighborhood.iter(); iter.valid(); iter.advance()) { + V databaseObject = relation.get(iter); + for(int d = 0; d < centerValues.length; d++) { + centerValues[d] += databaseObject.doubleValue(d + 1); + } + } + for(int d = 0; d < centerValues.length; d++) { + centerValues[d] /= neighborhood.size(); + } + for(DBIDIter iter = neighborhood.iter(); iter.valid(); iter.advance()) { + V databaseObject = relation.get(iter); + for(int d = 0; d < centerValues.length; d++) { + // distance + double distance = centerValues[d] - databaseObject.doubleValue(d + 1); + // variance + variances[d] += distance * distance; + } + } + expectationOfVariance = 0; + for(int d = 0; d < variances.length; d++) { + variances[d] /= neighborhood.size(); + expectationOfVariance += variances[d]; + } + expectationOfVariance /= variances.length; + weightVector = new BitSet(variances.length); + for(int d = 0; d < variances.length; d++) { + if(variances[d] < alpha * expectationOfVariance) { + weightVector.set(d, true); + } + } + center = DatabaseUtil.assumeVectorField(relation).getFactory().newNumberVector(centerValues); + sod = subspaceOutlierDegree(queryObject, center, weightVector); + } + else { + center = queryObject; + sod = 0.0; + } + } + + /** + * Compute SOD score + * + * @param queryObject + * @param center + * @param weightVector + * @return sod value + */ + private double subspaceOutlierDegree(V queryObject, V center, BitSet weightVector) { + final SubspaceEuclideanDistanceFunction df = new SubspaceEuclideanDistanceFunction(weightVector); + final int card = weightVector.cardinality(); + if(card == 0) { + return 0; + } + double distance = df.distance(queryObject, center).doubleValue(); + distance /= card; + return distance; + } + + /** + * Return the SOD of the point. 
+ * + * @return sod value + */ + public double getSod() { + return this.sod; + } + + @Override + public void writeToText(TextWriterStream out, String label) { + out.inlinePrint(label + "=" + this.sod); + out.commentPrintLn(this.getClass().getSimpleName() + ":"); + out.commentPrintLn("relevant attributes (counting starts with 0): " + this.weightVector.toString()); + out.commentPrintLn("center of neighborhood: " + out.normalizationRestore(center).toString()); + out.commentPrintLn("subspace outlier degree: " + this.sod); + out.commentPrintSeparator(); + } + + @Override + public int compareTo(SODModel o) { + return Double.compare(this.getSod(), o.getSod()); + } + + } + + /** + * Proxy class that converts a model result to an actual SOD score result. + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + protected static class SODProxyScoreResult implements Relation { + /** + * Model result this is a proxy for. + */ + Relation> models; + + /** + * The IDs we are defined for + */ + DBIDs dbids; + + /** + * Constructor. 
+ * + * @param models Models result + * @param dbids IDs we are defined for + */ + public SODProxyScoreResult(Relation> models, DBIDs dbids) { + super(); + this.models = models; + this.dbids = dbids; + } + + @Override + public Double get(DBIDRef objID) { + return models.get(objID).getSod(); + } + + @Override + public String getLongName() { + return "Subspace Outlier Degree"; + } + + @Override + public String getShortName() { + return "sod-outlier"; + } + + @Override + public DBIDs getDBIDs() { + return dbids; + } + + @Override + public DBIDIter iterDBIDs() { + return dbids.iter(); + } + + @Override + public Database getDatabase() { + return null; // FIXME + } + + @Override + public void set(DBIDRef id, Double val) { + throw new UnsupportedOperationException(); + } + + @Override + public void delete(DBIDRef id) { + throw new UnsupportedOperationException(); + } + + @Override + public SimpleTypeInformation getDataTypeInformation() { + return TypeUtil.DOUBLE; + } + + @Override + public int size() { + return dbids.size(); + } + + @Override + public ResultHierarchy getHierarchy() { + return models.getHierarchy(); + } + + @Override + public void setHierarchy(ResultHierarchy hierarchy) { + models.setHierarchy(hierarchy); + } + } + + /** + * Parameterization class. + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer, D extends NumberDistance> extends AbstractParameterizer { + /** + * Holds the value of {@link #KNN_ID}. + */ + private int knn = 1; + + /** + * Holds the value of {@link #ALPHA_ID}. + */ + private double alpha = 1.1; + + /** + * The similarity function - {@link #SIM_ID}. 
+ */ + private SimilarityFunction similarityFunction; + + @Override + protected void makeOptions(Parameterization config) { + super.makeOptions(config); + final ObjectParameter> simP = new ObjectParameter>(SIM_ID, SimilarityFunction.class, SharedNearestNeighborSimilarityFunction.class); + if(config.grab(simP)) { + similarityFunction = simP.instantiateClass(config); + } + + final IntParameter knnP = new IntParameter(KNN_ID, new GreaterConstraint(0)); + if(config.grab(knnP)) { + knn = knnP.getValue(); + } + + final DoubleParameter alphaP = new DoubleParameter(ALPHA_ID, new GreaterConstraint(0), 1.1); + if(config.grab(alphaP)) { + alpha = alphaP.getValue(); + } + } + + @Override + protected SOD makeInstance() { + return new SOD(knn, alpha, similarityFunction); + } + } +} \ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/subspace/package-info.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/subspace/package-info.java new file mode 100644 index 00000000..8b1c80df --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/subspace/package-info.java @@ -0,0 +1,28 @@ +/** + *

Subspace outlier detection methods.

+ * + * Methods that detect outliers in subspaces (projections) of the data set. + */ +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2012 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see . + */ +package de.lmu.ifi.dbs.elki.algorithm.outlier.subspace; \ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/trivial/ByLabelOutlier.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/trivial/ByLabelOutlier.java index 86730404..66a89cf5 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/trivial/ByLabelOutlier.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/trivial/ByLabelOutlier.java @@ -35,7 +35,7 @@ import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore; -import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.logging.Logging; @@ -112,15 +112,10 @@ public class ByLabelOutlier extends AbstractAlgorithm implements 
*/ public OutlierResult run(Relation relation) { WritableDoubleDataStore scores = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_HOT); - for(DBID id : relation.iterDBIDs()) { - String label = relation.get(id).toString(); - final double score; - if (pattern.matcher(label).matches()) { - score = 1.0; - } else { - score = 0.0; - } - scores.putDouble(id, score); + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + String label = relation.get(iditer).toString(); + final double score = (pattern.matcher(label).matches()) ? 1 : 0; + scores.putDouble(iditer, score); } Relation scoreres = new MaterializedRelation("By label outlier scores", "label-outlier", TypeUtil.DOUBLE, scores, relation.getDBIDs()); OutlierScoreMeta meta = new ProbabilisticOutlierScore(); diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/trivial/TrivialAllOutlier.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/trivial/TrivialAllOutlier.java index 509e35e9..b50226f1 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/trivial/TrivialAllOutlier.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/trivial/TrivialAllOutlier.java @@ -30,7 +30,7 @@ import de.lmu.ifi.dbs.elki.data.type.TypeUtil; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore; -import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.logging.Logging; @@ -70,8 +70,8 @@ public class TrivialAllOutlier extends AbstractAlgorithm implemen */ public OutlierResult run(Relation relation) { WritableDoubleDataStore scores = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_HOT); - for(DBID id : relation.iterDBIDs()) { - 
scores.putDouble(id, 1.0); + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + scores.putDouble(iditer, 1.0); } Relation scoreres = new MaterializedRelation("Trivial all-outlier score", "all-outlier", TypeUtil.DOUBLE, scores, relation.getDBIDs()); OutlierScoreMeta meta = new ProbabilisticOutlierScore(); diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/trivial/TrivialGeneratedOutlier.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/trivial/TrivialGeneratedOutlier.java index db40ff30..d1c2e076 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/trivial/TrivialGeneratedOutlier.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/trivial/TrivialGeneratedOutlier.java @@ -37,7 +37,7 @@ import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore; -import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.logging.Logging; @@ -100,7 +100,7 @@ public class TrivialGeneratedOutlier extends AbstractAlgorithm im } @Override - public OutlierResult run(Database database) throws IllegalStateException { + public OutlierResult run(Database database) { Relation> vecs = database.getRelation(TypeUtil.NUMBER_VECTOR_FIELD); Relation models = database.getRelation(new SimpleTypeInformation(Model.class)); // Prefer a true class label @@ -129,8 +129,8 @@ public class TrivialGeneratedOutlier extends AbstractAlgorithm im final double minscore = expect / (expect + 1); HashSet generators = new HashSet(); - for(DBID id : models.iterDBIDs()) { - Model model = models.get(id); + for(DBIDIter iditer = models.iterDBIDs(); iditer.valid(); iditer.advance()) { + Model model = models.get(iditer); 
if(model instanceof GeneratorSingleCluster) { generators.add((GeneratorSingleCluster) model); } @@ -139,10 +139,10 @@ public class TrivialGeneratedOutlier extends AbstractAlgorithm im logger.warning("No generator models found for dataset - all points will be considered outliers."); } - for(DBID id : models.iterDBIDs()) { + for(DBIDIter iditer = models.iterDBIDs(); iditer.valid(); iditer.advance()) { double score = 0.0; // Convert to a math vector - Vector v = vecs.get(id).getColumnVector(); + Vector v = vecs.get(iditer).getColumnVector(); for(GeneratorSingleCluster gen : generators) { Vector tv = v; // Transform backwards @@ -170,7 +170,7 @@ public class TrivialGeneratedOutlier extends AbstractAlgorithm im score = expect / (expect + score); // adjust to 0 to 1 range: score = (score - minscore) / (1 - minscore); - scores.putDouble(id, score); + scores.putDouble(iditer, score); } Relation scoreres = new MaterializedRelation("Model outlier scores", "model-outlier", TypeUtil.DOUBLE, scores, models.getDBIDs()); OutlierScoreMeta meta = new ProbabilisticOutlierScore(0., 1.); diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/trivial/TrivialNoOutlier.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/trivial/TrivialNoOutlier.java index cff2ad2c..6d8e9f46 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/trivial/TrivialNoOutlier.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/trivial/TrivialNoOutlier.java @@ -30,7 +30,7 @@ import de.lmu.ifi.dbs.elki.data.type.TypeUtil; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore; -import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.logging.Logging; @@ -68,10 +68,10 @@ public class 
TrivialNoOutlier extends AbstractAlgorithm implement * @param relation Relation * @return Result */ - public OutlierResult run(Relation relation) throws IllegalStateException { + public OutlierResult run(Relation relation) { WritableDoubleDataStore scores = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_HOT); - for(DBID id : relation.iterDBIDs()) { - scores.putDouble(id, 0.0); + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + scores.putDouble(iditer, 0.0); } Relation scoreres = new MaterializedRelation("Trivial no-outlier score", "no-outlier", TypeUtil.DOUBLE, scores, relation.getDBIDs()); OutlierScoreMeta meta = new ProbabilisticOutlierScore(); diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/statistics/AddSingleScale.java b/src/de/lmu/ifi/dbs/elki/algorithm/statistics/AddSingleScale.java new file mode 100644 index 00000000..481261b3 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/statistics/AddSingleScale.java @@ -0,0 +1,97 @@ +package de.lmu.ifi.dbs.elki.algorithm.statistics; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2012 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see . 
+ */ +import de.lmu.ifi.dbs.elki.algorithm.Algorithm; +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.data.type.TypeInformation; +import de.lmu.ifi.dbs.elki.data.type.TypeUtil; +import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.math.DoubleMinMax; +import de.lmu.ifi.dbs.elki.math.scales.LinearScale; +import de.lmu.ifi.dbs.elki.result.Result; +import de.lmu.ifi.dbs.elki.result.ResultUtil; +import de.lmu.ifi.dbs.elki.result.ScalesResult; +import de.lmu.ifi.dbs.elki.utilities.DatabaseUtil; +import de.lmu.ifi.dbs.elki.utilities.documentation.Description; + +/** + * Pseudo "algorithm" that computes the global min/max for a relation across all + * attributes. + * + * @author Erich Schubert + */ +@Description("Setup a scaling so that all dimensions are scaled equally in visualization.") +public class AddSingleScale implements Algorithm { + /** + * Constructor. + */ + public AddSingleScale() { + super(); + } + + @SuppressWarnings("unchecked") + @Override + public Result run(Database database) { + for(Relation rel : database.getRelations()) { + if(TypeUtil.NUMBER_VECTOR_FIELD.isAssignableFromType(rel.getDataTypeInformation())) { + ScalesResult res = run((Relation>) rel); + ResultUtil.addChildResult(rel, res); + } + } + return null; + } + + /** + * Add scales to a single vector relation. 
+ * + * @param rel Relation + * @return Scales + */ + private ScalesResult run(Relation> rel) { + final int dim = DatabaseUtil.dimensionality(rel); + DoubleMinMax minmax = new DoubleMinMax(); + for(DBIDIter iditer = rel.iterDBIDs(); iditer.valid(); iditer.advance()) { + DBID id = iditer.getDBID(); + NumberVector vec = rel.get(id); + for(int d = 1; d <= dim; d++) { + minmax.put(vec.doubleValue(d)); + } + } + LinearScale scale = new LinearScale(minmax.getMin(), minmax.getMax()); + LinearScale[] scales = new LinearScale[dim]; + for(int i = 0; i < dim; i++) { + scales[i] = scale; + } + ScalesResult res = new ScalesResult(scales); + return res; + } + + @Override + public TypeInformation[] getInputTypeRestriction() { + return TypeUtil.array(TypeUtil.NUMBER_VECTOR_FIELD); + } +} \ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/statistics/AveragePrecisionAtK.java b/src/de/lmu/ifi/dbs/elki/algorithm/statistics/AveragePrecisionAtK.java index 1c74621b..f6f1d16f 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/statistics/AveragePrecisionAtK.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/statistics/AveragePrecisionAtK.java @@ -34,6 +34,7 @@ import de.lmu.ifi.dbs.elki.data.type.TypeInformation; import de.lmu.ifi.dbs.elki.data.type.TypeUtil; import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; import de.lmu.ifi.dbs.elki.database.query.DistanceResultPair; @@ -101,7 +102,7 @@ public class AveragePrecisionAtK run(Database database) throws IllegalStateException { + public HistogramResult run(Database database) { final Relation relation = database.getRelation(getInputTypeRestriction()[0]); final Relation lrelation = database.getRelation(getInputTypeRestriction()[1]); final DistanceQuery distQuery = database.getDistanceQuery(relation, getDistanceFunction()); @@ -122,7 +123,8 @@ 
public class AveragePrecisionAtK knn = knnQuery.getKNNForDBID(id, k); Object label = lrelation.get(id); diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/statistics/DistanceStatisticsWithClasses.java b/src/de/lmu/ifi/dbs/elki/algorithm/statistics/DistanceStatisticsWithClasses.java index 78bbf5f4..d6ce6a15 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/statistics/DistanceStatisticsWithClasses.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/statistics/DistanceStatisticsWithClasses.java @@ -31,7 +31,7 @@ import java.util.Random; import java.util.TreeSet; import de.lmu.ifi.dbs.elki.algorithm.AbstractDistanceBasedAlgorithm; -import de.lmu.ifi.dbs.elki.algorithm.clustering.trivial.ByLabelClustering; +import de.lmu.ifi.dbs.elki.algorithm.clustering.trivial.ByLabelOrAllInOneClustering; import de.lmu.ifi.dbs.elki.data.Cluster; import de.lmu.ifi.dbs.elki.data.DoubleVector; import de.lmu.ifi.dbs.elki.data.model.Model; @@ -40,6 +40,7 @@ import de.lmu.ifi.dbs.elki.data.type.TypeUtil; import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; @@ -66,7 +67,6 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.Flag; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.Parameter; import de.lmu.ifi.dbs.elki.utilities.pairs.DoubleObjPair; -import de.lmu.ifi.dbs.elki.utilities.pairs.FCPair; import de.lmu.ifi.dbs.elki.utilities.pairs.Pair; /** @@ -131,11 +131,8 @@ public class DistanceStatisticsWithClasses> ex this.sampling = sampling; } - /** - * Iterates over all points in the database. 
- */ @Override - public HistogramResult run(Database database) throws IllegalStateException { + public HistogramResult run(Database database) { final Relation relation = database.getRelation(getInputTypeRestriction()[0]); final DistanceQuery distFunc = database.getDistanceQuery(relation, getDistanceFunction()); @@ -145,7 +142,7 @@ public class DistanceStatisticsWithClasses> ex DoubleMinMax gminmax = new DoubleMinMax(); // Cluster by labels - Collection> split = (new ByLabelClustering()).run(database).getAllClusters(); + Collection> split = (new ByLabelOrAllInOneClustering()).run(database).getAllClusters(); // global in-cluster min/max DoubleMinMax giminmax = new DoubleMinMax(); @@ -184,12 +181,14 @@ public class DistanceStatisticsWithClasses> ex final Pair incFirst = new Pair(1L, 0L); final Pair incSecond = new Pair(0L, 1L); for(Cluster c1 : split) { - for(DBID id1 : c1.getIDs()) { + for(DBIDIter iter = c1.getIDs().iter(); iter.valid(); iter.advance()) { + DBID id1 = iter.getDBID(); // in-cluster distances DoubleMinMax iminmax = new DoubleMinMax(); - for(DBID id2 : c1.getIDs()) { + for(DBIDIter iter2 = c1.getIDs().iter(); iter2.valid(); iter2.advance()) { + DBID id2 = iter2.getDBID(); // skip the point itself. - if(id1 == id2) { + if(id1.sameDBID(id2)) { continue; } double d = distFunc.distance(id1, id2).doubleValue(); @@ -212,9 +211,10 @@ public class DistanceStatisticsWithClasses> ex if(c2 == c1) { continue; } - for(DBID id2 : c2.getIDs()) { + for(DBIDIter iter2 = c2.getIDs().iter(); iter2.valid(); iter2.advance()) { + DBID id2 = iter2.getDBID(); // skip the point itself (shouldn't happen though) - if(id1 == id2) { + if(id1.sameDBID(id2)) { continue; } double d = distFunc.distance(id1, id2).doubleValue(); @@ -255,8 +255,6 @@ public class DistanceStatisticsWithClasses> ex onum += ppair.getSecond().getSecond(); } long bnum = inum + onum; - // Note: when full sampling is added, this assertion won't hold anymore. 
- assert (bnum == relation.size() * (relation.size() - 1)); Collection binstat = new ArrayList(numbin); for(DoubleObjPair> ppair : histogram) { @@ -285,58 +283,62 @@ public class DistanceStatisticsWithClasses> ex Random rnd = new Random(); // estimate minimum and maximum. int k = (int) Math.max(25, Math.pow(database.size(), 0.2)); - TreeSet> minhotset = new TreeSet>(); - TreeSet> maxhotset = new TreeSet>(Collections.reverseOrder()); + TreeSet> minhotset = new TreeSet>(); + TreeSet> maxhotset = new TreeSet>(Collections.reverseOrder()); int randomsize = (int) Math.max(25, Math.pow(database.size(), 0.2)); double rprob = ((double) randomsize) / size; ArrayModifiableDBIDs randomset = DBIDUtil.newArray(randomsize); - Iterator iter = database.iterDBIDs(); - if(!iter.hasNext()) { + DBIDIter iter = database.iterDBIDs(); + if(!iter.valid()) { throw new IllegalStateException(ExceptionMessages.DATABASE_EMPTY); } - DBID firstid = iter.next(); - minhotset.add(new FCPair(Double.MAX_VALUE, firstid)); - maxhotset.add(new FCPair(Double.MIN_VALUE, firstid)); - while(iter.hasNext()) { - DBID id1 = iter.next(); + DBID firstid = iter.getDBID(); + iter.advance(); + minhotset.add(new DoubleObjPair(Double.MAX_VALUE, firstid)); + maxhotset.add(new DoubleObjPair(Double.MIN_VALUE, firstid)); + while(iter.valid()) { + DBID id1 = iter.getDBID(); + iter.advance(); // generate candidates for min distance. 
- ArrayList> np = new ArrayList>(k * 2 + randomsize * 2); - for(FCPair pair : minhotset) { + ArrayList> np = new ArrayList>(k * 2 + randomsize * 2); + for(DoubleObjPair pair : minhotset) { DBID id2 = pair.getSecond(); // skip the object itself if(id1.compareTo(id2) == 0) { continue; } double d = distFunc.distance(id1, id2).doubleValue(); - np.add(new FCPair(d, id1)); - np.add(new FCPair(d, id2)); + np.add(new DoubleObjPair(d, id1)); + np.add(new DoubleObjPair(d, id2)); } - for(DBID id2 : randomset) { + for(DBIDIter iter2 = randomset.iter(); iter2.valid(); iter2.advance()) { + DBID id2 = iter2.getDBID(); double d = distFunc.distance(id1, id2).doubleValue(); - np.add(new FCPair(d, id1)); - np.add(new FCPair(d, id2)); + np.add(new DoubleObjPair(d, id1)); + np.add(new DoubleObjPair(d, id2)); } minhotset.addAll(np); shrinkHeap(minhotset, k); // generate candidates for max distance. - ArrayList> np2 = new ArrayList>(k * 2 + randomsize * 2); - for(FCPair pair : minhotset) { + ArrayList> np2 = new ArrayList>(k * 2 + randomsize * 2); + for(DoubleObjPair pair : minhotset) { DBID id2 = pair.getSecond(); // skip the object itself if(id1.compareTo(id2) == 0) { continue; } double d = distFunc.distance(id1, id2).doubleValue(); - np2.add(new FCPair(d, id1)); - np2.add(new FCPair(d, id2)); + np2.add(new DoubleObjPair(d, id1)); + np2.add(new DoubleObjPair(d, id2)); } - for(DBID id2 : randomset) { + for(DBIDIter iter2 = randomset.iter(); iter2.valid(); iter2.advance()) { + DBID id2 = iter2.getDBID(); double d = distFunc.distance(id1, id2).doubleValue(); - np.add(new FCPair(d, id1)); - np.add(new FCPair(d, id2)); + np.add(new DoubleObjPair(d, id1)); + np.add(new DoubleObjPair(d, id2)); } maxhotset.addAll(np2); shrinkHeap(maxhotset, k); @@ -349,14 +351,16 @@ public class DistanceStatisticsWithClasses> ex randomset.set((int) Math.floor(rnd.nextDouble() * randomsize), id1); } } - return new DoubleMinMax(minhotset.first().getFirst(), maxhotset.first().getFirst()); + return new 
DoubleMinMax(minhotset.first().first, maxhotset.first().first); } private DoubleMinMax exactMinMax(Relation database, DistanceQuery distFunc) { DoubleMinMax minmax = new DoubleMinMax(); // find exact minimum and maximum first. - for(DBID id1 : database.iterDBIDs()) { - for(DBID id2 : database.iterDBIDs()) { + for(DBIDIter iditer = database.iterDBIDs(); iditer.valid(); iditer.advance()) { + DBID id1 = iditer.getDBID(); + for(DBIDIter iditer2 = database.iterDBIDs(); iditer2.valid(); iditer2.advance()) { + DBID id2 = iditer2.getDBID(); // skip the point itself. if(id1.compareTo(id2) == 0) { continue; @@ -368,12 +372,12 @@ public class DistanceStatisticsWithClasses> ex return minmax; } - private void shrinkHeap(TreeSet> hotset, int k) { + private void shrinkHeap(TreeSet> hotset, int k) { // drop duplicates ModifiableDBIDs seenids = DBIDUtil.newHashSet(2 * k); int cnt = 0; - for(Iterator> i = hotset.iterator(); i.hasNext();) { - FCPair p = i.next(); + for(Iterator> i = hotset.iterator(); i.hasNext();) { + DoubleObjPair p = i.next(); if(cnt > k || seenids.contains(p.getSecond())) { i.remove(); } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/statistics/EvaluateRankingQuality.java b/src/de/lmu/ifi/dbs/elki/algorithm/statistics/EvaluateRankingQuality.java index c1eb118d..353c1b02 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/statistics/EvaluateRankingQuality.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/statistics/EvaluateRankingQuality.java @@ -29,7 +29,7 @@ import java.util.Collections; import java.util.HashMap; import de.lmu.ifi.dbs.elki.algorithm.AbstractDistanceBasedAlgorithm; -import de.lmu.ifi.dbs.elki.algorithm.clustering.trivial.ByLabelClustering; +import de.lmu.ifi.dbs.elki.algorithm.clustering.trivial.ByLabelOrAllInOneClustering; import de.lmu.ifi.dbs.elki.data.Cluster; import de.lmu.ifi.dbs.elki.data.DoubleVector; import de.lmu.ifi.dbs.elki.data.NumberVector; @@ -39,6 +39,7 @@ import de.lmu.ifi.dbs.elki.data.type.TypeInformation; import 
de.lmu.ifi.dbs.elki.data.type.TypeUtil; import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; import de.lmu.ifi.dbs.elki.database.query.knn.KNNQuery; import de.lmu.ifi.dbs.elki.database.query.knn.KNNResult; @@ -114,11 +115,8 @@ public class EvaluateRankingQuality, D extends Numb */ int numbins = 20; - /** - * Run the algorithm. - */ @Override - public HistogramResult run(Database database) throws IllegalStateException { + public HistogramResult run(Database database) { final Relation relation = database.getRelation(getInputTypeRestriction()[0]); final DistanceQuery distQuery = database.getDistanceQuery(relation, getDistanceFunction()); final KNNQuery knnQuery = database.getKNNQuery(distQuery, relation.size()); @@ -127,7 +125,7 @@ public class EvaluateRankingQuality, D extends Numb logger.verbose("Preprocessing clusters..."); } // Cluster by labels - Collection> split = (new ByLabelClustering()).run(database).getAllClusters(); + Collection> split = (new ByLabelOrAllInOneClustering()).run(database).getAllClusters(); // Compute cluster averages and covariance matrix HashMap, V> averages = new HashMap, V>(split.size()); @@ -150,7 +148,8 @@ public class EvaluateRankingQuality, D extends Numb Vector av = averages.get(clus).getColumnVector(); Matrix covm = covmats.get(clus); - for(DBID i1 : clus.getIDs()) { + for(DBIDIter iter = clus.getIDs().iter(); iter.valid(); iter.advance()) { + DBID i1 = iter.getDBID(); Double d = MathUtil.mahalanobisDistance(covm, av.minus(relation.get(i1).getColumnVector())); cmem.add(new FCPair(d, i1)); } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/statistics/RankingQualityHistogram.java b/src/de/lmu/ifi/dbs/elki/algorithm/statistics/RankingQualityHistogram.java index 6d64dc55..4305bbca 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/statistics/RankingQualityHistogram.java +++ 
b/src/de/lmu/ifi/dbs/elki/algorithm/statistics/RankingQualityHistogram.java @@ -27,7 +27,7 @@ import java.util.ArrayList; import java.util.Collection; import de.lmu.ifi.dbs.elki.algorithm.AbstractDistanceBasedAlgorithm; -import de.lmu.ifi.dbs.elki.algorithm.clustering.trivial.ByLabelClustering; +import de.lmu.ifi.dbs.elki.algorithm.clustering.trivial.ByLabelOrAllInOneClustering; import de.lmu.ifi.dbs.elki.data.Cluster; import de.lmu.ifi.dbs.elki.data.DoubleVector; import de.lmu.ifi.dbs.elki.data.model.Model; @@ -35,6 +35,7 @@ import de.lmu.ifi.dbs.elki.data.type.TypeInformation; import de.lmu.ifi.dbs.elki.data.type.TypeUtil; import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; import de.lmu.ifi.dbs.elki.database.query.knn.KNNQuery; import de.lmu.ifi.dbs.elki.database.query.knn.KNNResult; @@ -107,7 +108,7 @@ public class RankingQualityHistogram> extends logger.verbose("Preprocessing clusters..."); } // Cluster by labels - Collection> split = (new ByLabelClustering()).run(database).getAllClusters(); + Collection> split = (new ByLabelOrAllInOneClustering()).run(database).getAllClusters(); AggregatingHistogram hist = AggregatingHistogram.DoubleSumHistogram(numbins, 0.0, 1.0); @@ -119,7 +120,8 @@ public class RankingQualityHistogram> extends MeanVariance mv = new MeanVariance(); // sort neighbors for(Cluster clus : split) { - for(DBID i1 : clus.getIDs()) { + for(DBIDIter iter = clus.getIDs().iter(); iter.valid(); iter.advance()) { + DBID i1 = iter.getDBID(); KNNResult knn = knnQuery.getKNNForDBID(i1, relation.size()); double result = ROC.computeROCAUCDistanceResult(relation.size(), clus, knn); diff --git a/src/de/lmu/ifi/dbs/elki/application/GeneratorXMLSpec.java b/src/de/lmu/ifi/dbs/elki/application/GeneratorXMLSpec.java index ccee6d39..19207c5c 100644 --- a/src/de/lmu/ifi/dbs/elki/application/GeneratorXMLSpec.java 
+++ b/src/de/lmu/ifi/dbs/elki/application/GeneratorXMLSpec.java @@ -44,7 +44,7 @@ import de.lmu.ifi.dbs.elki.datasource.GeneratorXMLDatabaseConnection; import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle; import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; -import de.lmu.ifi.dbs.elki.math.statistics.distribution.Distribution; +import de.lmu.ifi.dbs.elki.math.statistics.distribution.DistributionWithRandom; import de.lmu.ifi.dbs.elki.utilities.FormatUtil; import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException; import de.lmu.ifi.dbs.elki.utilities.exceptions.UnableToComplyException; @@ -186,7 +186,7 @@ public class GeneratorXMLSpec extends AbstractApplication { outStream.write("## Density correction factor: " + cursclus.getDensityCorrection() + LINE_SEPARATOR); outStream.write("## Generators:" + LINE_SEPARATOR); for(int i = 0; i < cursclus.getDim(); i++) { - Distribution gen = cursclus.getDistribution(i); + DistributionWithRandom gen = cursclus.getDistribution(i); outStream.write("## " + gen.toString() + LINE_SEPARATOR); } if(cursclus.getTransformation() != null && cursclus.getTransformation().getTransformation() != null) { diff --git a/src/de/lmu/ifi/dbs/elki/application/cache/CacheDoubleDistanceInOnDiskMatrix.java b/src/de/lmu/ifi/dbs/elki/application/cache/CacheDoubleDistanceInOnDiskMatrix.java index c2e575be..19a22699 100644 --- a/src/de/lmu/ifi/dbs/elki/application/cache/CacheDoubleDistanceInOnDiskMatrix.java +++ b/src/de/lmu/ifi/dbs/elki/application/cache/CacheDoubleDistanceInOnDiskMatrix.java @@ -30,6 +30,7 @@ import de.lmu.ifi.dbs.elki.application.AbstractApplication; import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.StaticArrayDatabase; import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; import de.lmu.ifi.dbs.elki.database.relation.Relation; import 
de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction; @@ -119,7 +120,8 @@ public class CacheDoubleDistanceInOnDiskMatrix DistanceQuery distanceQuery = database.getDistanceQuery(relation, distance); int matrixsize = 0; - for(DBID id : relation.iterDBIDs()) { + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + DBID id = iditer.getDBID(); matrixsize = Math.max(matrixsize, id.getIntegerID() + 1); if(id.getIntegerID() < 0) { throw new AbortException("OnDiskMatrixCache does not allow negative DBIDs."); @@ -134,8 +136,10 @@ public class CacheDoubleDistanceInOnDiskMatrix throw new AbortException("Error creating output matrix.", e); } - for(DBID id1 : relation.iterDBIDs()) { - for(DBID id2 : relation.iterDBIDs()) { + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + DBID id1 = iditer.getDBID(); + for(DBIDIter iditer2 = relation.iterDBIDs(); iditer2.valid(); iditer2.advance()) { + DBID id2 = iditer2.getDBID(); if(id2.getIntegerID() >= id1.getIntegerID()) { double d = distanceQuery.distance(id1, id2).doubleValue(); if(debugExtraCheckSymmetry) { diff --git a/src/de/lmu/ifi/dbs/elki/application/cache/CacheFloatDistanceInOnDiskMatrix.java b/src/de/lmu/ifi/dbs/elki/application/cache/CacheFloatDistanceInOnDiskMatrix.java index 237aedb9..78a64442 100644 --- a/src/de/lmu/ifi/dbs/elki/application/cache/CacheFloatDistanceInOnDiskMatrix.java +++ b/src/de/lmu/ifi/dbs/elki/application/cache/CacheFloatDistanceInOnDiskMatrix.java @@ -30,6 +30,7 @@ import de.lmu.ifi.dbs.elki.application.AbstractApplication; import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.StaticArrayDatabase; import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction; @@ -124,7 +125,8 @@ public 
class CacheFloatDistanceInOnDiskMatrix> DistanceQuery distanceQuery = database.getDistanceQuery(relation, distance); int matrixsize = 0; - for(DBID id : relation.iterDBIDs()) { + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + DBID id = iditer.getDBID(); matrixsize = Math.max(matrixsize, id.getIntegerID() + 1); if(id.getIntegerID() < 0) { throw new AbortException("OnDiskMatrixCache does not allow negative DBIDs."); @@ -139,8 +141,10 @@ public class CacheFloatDistanceInOnDiskMatrix> throw new AbortException("Error creating output matrix.", e); } - for(DBID id1 : relation.iterDBIDs()) { - for(DBID id2 : relation.iterDBIDs()) { + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + DBID id1 = iditer.getDBID(); + for(DBIDIter iditer2 = relation.iterDBIDs(); iditer2.valid(); iditer2.advance()) { + DBID id2 = iditer2.getDBID(); if(id2.getIntegerID() >= id1.getIntegerID()) { float d = distanceQuery.distance(id1, id2).floatValue(); if(debugExtraCheckSymmetry) { diff --git a/src/de/lmu/ifi/dbs/elki/application/greedyensemble/ComputeKNNOutlierScores.java b/src/de/lmu/ifi/dbs/elki/application/greedyensemble/ComputeKNNOutlierScores.java index f6506fcc..de1ddbec 100644 --- a/src/de/lmu/ifi/dbs/elki/application/greedyensemble/ComputeKNNOutlierScores.java +++ b/src/de/lmu/ifi/dbs/elki/application/greedyensemble/ComputeKNNOutlierScores.java @@ -44,6 +44,7 @@ import de.lmu.ifi.dbs.elki.data.DoubleVector; import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.QueryUtil; import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; import de.lmu.ifi.dbs.elki.database.query.knn.KNNQuery; import de.lmu.ifi.dbs.elki.database.query.knn.PreprocessorKNNQuery; @@ -185,7 +186,8 @@ public class ComputeKNNOutlierScores> extends { try { MessageDigest md = MessageDigest.getInstance("MD5"); - for(DBID id : ids) { + for 
(DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) { + DBID id = iter.getDBID(); md.update(" ".getBytes()); md.update(id.toString().getBytes()); } @@ -330,8 +332,8 @@ public class ComputeKNNOutlierScores> extends void writeResult(PrintStream out, DBIDs ids, OutlierResult result, ScalingFunction scaling, String label) { out.append(label); Relation scores = result.getScores(); - for(DBID id : ids) { - final double value = scaling.getScaled(scores.get(id)); + for (DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) { + final double value = scaling.getScaled(scores.get(iter)); out.append(" ").append(FormatUtil.format(value, FormatUtil.NF8)); } out.append(FormatUtil.NEWLINE); diff --git a/src/de/lmu/ifi/dbs/elki/application/greedyensemble/GreedyEnsembleExperiment.java b/src/de/lmu/ifi/dbs/elki/application/greedyensemble/GreedyEnsembleExperiment.java index cfd768c1..6b2f9e13 100644 --- a/src/de/lmu/ifi/dbs/elki/application/greedyensemble/GreedyEnsembleExperiment.java +++ b/src/de/lmu/ifi/dbs/elki/application/greedyensemble/GreedyEnsembleExperiment.java @@ -33,6 +33,7 @@ import de.lmu.ifi.dbs.elki.data.NumberVector; import de.lmu.ifi.dbs.elki.data.type.TypeUtil; import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; import de.lmu.ifi.dbs.elki.database.ids.HashSetModifiableDBIDs; @@ -43,6 +44,7 @@ import de.lmu.ifi.dbs.elki.distance.distancefunction.correlation.WeightedPearson import de.lmu.ifi.dbs.elki.evaluation.roc.ROC; import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.math.MeanVariance; +import de.lmu.ifi.dbs.elki.math.geometry.XYCurve; import de.lmu.ifi.dbs.elki.utilities.DatabaseUtil; import de.lmu.ifi.dbs.elki.utilities.datastructures.heap.TiedTopBoundedHeap; import de.lmu.ifi.dbs.elki.utilities.datastructures.heap.TopBoundedHeap; @@ -114,7 +116,7 @@ 
public class GreedyEnsembleExperiment extends AbstractApplication { final Database database = inputstep.getDatabase(); final Relation> relation = database.getRelation(TypeUtil.NUMBER_VECTOR_FIELD); final Relation labels = DatabaseUtil.guessLabelRepresentation(database); - final DBID firstid = labels.iterDBIDs().next(); + final DBID firstid = labels.iterDBIDs().getDBID(); final String firstlabel = labels.get(firstid); if(!firstlabel.matches("bylabel")) { throw new AbortException("No 'by label' reference outlier found, which is needed for weighting!"); @@ -137,9 +139,10 @@ public class GreedyEnsembleExperiment extends AbstractApplication { final int[] outliers_seen = new int[dim]; // Find the top-k for each ensemble member { - for(DBID id : relation.iterDBIDs()) { + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + DBID id = iditer.getDBID(); // Skip "by label", obviously - if(firstid.equals(id)) { + if(firstid.sameDBID(id)) { continue; } final NumberVector vec = relation.get(id); @@ -174,7 +177,8 @@ public class GreedyEnsembleExperiment extends AbstractApplication { // Build the naive ensemble: final double[] naiveensemble = new double[dim]; { - for(DBID id : relation.iterDBIDs()) { + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + DBID id = iditer.getDBID(); if(firstid.equals(id)) { continue; } @@ -199,7 +203,8 @@ public class GreedyEnsembleExperiment extends AbstractApplication { double bestest = Double.POSITIVE_INFINITY; { // Compute individual scores - for(DBID id : relation.iterDBIDs()) { + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + DBID id = iditer.getDBID(); if(firstid.equals(id)) { continue; } @@ -250,7 +255,8 @@ public class GreedyEnsembleExperiment extends AbstractApplication { final int heapsize = enscands.size(); TopBoundedHeap> heap = new TopBoundedHeap>(heapsize, Collections.reverseOrder()); - for(DBID id : enscands) { + for (DBIDIter iter = 
enscands.iter(); iter.valid(); iter.advance()) { + DBID id = iter.getDBID(); final NumberVector vec = relation.get(id); double diversity = wdist.doubleDistance(vec, greedyvec); heap.add(new DoubleObjPair(diversity, id)); @@ -301,11 +307,11 @@ public class GreedyEnsembleExperiment extends AbstractApplication { // Build the improved ensemble: StringBuffer greedylbl = new StringBuffer(); { - for(DBID id : ensemble) { + for (DBIDIter iter = ensemble.iter(); iter.valid(); iter.advance()) { if(greedylbl.length() > 0) { greedylbl.append(" "); } - greedylbl.append(labels.get(id)); + greedylbl.append(labels.get(iter)); } } NumberVector greedyvec = refvec.newNumberVector(greedyensemble); @@ -340,7 +346,8 @@ public class GreedyEnsembleExperiment extends AbstractApplication { final double[] randomensemble = new double[dim]; { DBIDs random = DBIDUtil.randomSample(candidates, ensemble.size(), (long)i); - for(DBID id : random) { + for (DBIDIter iter = random.iter(); iter.valid(); iter.advance()) { + DBID id = iter.getDBID(); assert (!firstid.equals(id)); // logger.verbose("Using: "+labels.get(id)); final NumberVector vec = relation.get(id); @@ -395,7 +402,7 @@ public class GreedyEnsembleExperiment extends AbstractApplication { scores[d] = new DoubleIntPair(vec.doubleValue(d + 1), d); } Arrays.sort(scores, Collections.reverseOrder(DoubleIntPair.BYFIRST_COMPARATOR)); - return ROC.computeAUC(ROC.materializeROC(dim, positive, Arrays.asList(scores).iterator())); + return XYCurve.areaUnderCurve(ROC.materializeROC(dim, positive, Arrays.asList(scores).iterator())); } double gain(double score, double ref, double optimal) { diff --git a/src/de/lmu/ifi/dbs/elki/application/greedyensemble/VisualizePairwiseGainMatrix.java b/src/de/lmu/ifi/dbs/elki/application/greedyensemble/VisualizePairwiseGainMatrix.java index 2c728878..105eeabc 100644 --- a/src/de/lmu/ifi/dbs/elki/application/greedyensemble/VisualizePairwiseGainMatrix.java +++ 
b/src/de/lmu/ifi/dbs/elki/application/greedyensemble/VisualizePairwiseGainMatrix.java @@ -45,6 +45,7 @@ import de.lmu.ifi.dbs.elki.evaluation.similaritymatrix.ComputeSimilarityMatrixIm import de.lmu.ifi.dbs.elki.evaluation.similaritymatrix.ComputeSimilarityMatrixImage.SimilarityMatrix; import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.math.DoubleMinMax; +import de.lmu.ifi.dbs.elki.math.geometry.XYCurve; import de.lmu.ifi.dbs.elki.result.ResultUtil; import de.lmu.ifi.dbs.elki.utilities.DatabaseUtil; import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; @@ -115,7 +116,7 @@ public class VisualizePairwiseGainMatrix extends AbstractApplication { final Database database = inputstep.getDatabase(); final Relation> relation = database.getRelation(TypeUtil.NUMBER_VECTOR_FIELD); final Relation labels = DatabaseUtil.guessLabelRepresentation(database); - final DBID firstid = labels.iterDBIDs().next(); + final DBID firstid = labels.iterDBIDs().getDBID(); final String firstlabel = labels.get(firstid); if(!firstlabel.matches("bylabel")) { throw new AbortException("No 'by label' reference outlier found, which is needed for weighting!"); @@ -153,7 +154,7 @@ public class VisualizePairwiseGainMatrix extends AbstractApplication { combined[d].second = d; } Arrays.sort(combined, Collections.reverseOrder(DoubleIntPair.BYFIRST_COMPARATOR)); - double auc = ROC.computeAUC(ROC.materializeROC(dim, pos, Arrays.asList(combined).iterator())); + double auc = XYCurve.areaUnderCurve(ROC.materializeROC(dim, pos, Arrays.asList(combined).iterator())); data[a][a] = auc; // minmax.put(auc); // logger.verbose(auc + " " + labels.get(ids.get(a))); @@ -166,7 +167,7 @@ public class VisualizePairwiseGainMatrix extends AbstractApplication { combined[d].second = d; } Arrays.sort(combined, Collections.reverseOrder(DoubleIntPair.BYFIRST_COMPARATOR)); - double auc = ROC.computeAUC(ROC.materializeROC(dim, pos, Arrays.asList(combined).iterator())); + double auc = 
XYCurve.areaUnderCurve(ROC.materializeROC(dim, pos, Arrays.asList(combined).iterator())); // logger.verbose(auc + " " + labels.get(ids.get(a)) + " " + // labels.get(ids.get(b))); data[a][b] = auc; diff --git a/src/de/lmu/ifi/dbs/elki/application/internal/CheckELKIServices.java b/src/de/lmu/ifi/dbs/elki/application/internal/CheckELKIServices.java index 21ed047b..21b8c698 100644 --- a/src/de/lmu/ifi/dbs/elki/application/internal/CheckELKIServices.java +++ b/src/de/lmu/ifi/dbs/elki/application/internal/CheckELKIServices.java @@ -60,7 +60,7 @@ public class CheckELKIServices { /** * Pattern to strip comments, while keeping commented class names. */ - private Pattern strip = Pattern.compile("^[\\s#]*(.*?)[\\s]*$"); + private Pattern strip = Pattern.compile("^[\\s#]*(?:deprecated:\\s*)?(.*?)[\\s]*$"); /** * Package to skip matches in - unreleased code. diff --git a/src/de/lmu/ifi/dbs/elki/application/jsmap/JSONWebServer.java b/src/de/lmu/ifi/dbs/elki/application/jsmap/JSONWebServer.java index 5b210795..2f51c743 100644 --- a/src/de/lmu/ifi/dbs/elki/application/jsmap/JSONWebServer.java +++ b/src/de/lmu/ifi/dbs/elki/application/jsmap/JSONWebServer.java @@ -41,6 +41,7 @@ import de.lmu.ifi.dbs.elki.data.spatial.Polygon; import de.lmu.ifi.dbs.elki.data.spatial.PolygonsObject; import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; import de.lmu.ifi.dbs.elki.database.relation.Relation; @@ -274,8 +275,8 @@ public class JSONWebServer implements HttpHandler { DBIDs neighbors = pred.getNeighborDBIDs(id); re.appendKeyValue("DBID", id); re.appendKeyArray("neighbors"); - for(DBID nid : neighbors) { - re.appendString(nid.toString()); + for (DBIDIter iter = neighbors.iter(); iter.valid(); iter.advance()) { + re.appendString(iter.toString()); } re.closeArray(); return; @@ -316,12 +317,12 @@ public class 
JSONWebServer implements HttpHandler { re.appendKeyArray("scores"); Relation scores = or.getScores(); - Iterator iter = or.getOrdering().iter(scores.getDBIDs()).iterator(); - for(int i = 0; i < offset && iter.hasNext(); i++) { - iter.next(); + DBIDIter iter = or.getOrdering().iter(scores.getDBIDs()).iter(); + for(int i = 0; i < offset && iter.valid(); i++) { + iter.advance(); } - for(int i = 0; i < pagesize && iter.hasNext(); i++) { - DBID id = iter.next(); + for(int i = 0; i < pagesize && iter.valid(); i++, iter.advance()) { + DBID id = iter.getDBID(); re.startHash(); bundleToJSON(re, id); final Double val = scores.get(id); diff --git a/src/de/lmu/ifi/dbs/elki/application/visualization/KNNExplorer.java b/src/de/lmu/ifi/dbs/elki/application/visualization/KNNExplorer.java index 0419f791..c7f6d266 100644 --- a/src/de/lmu/ifi/dbs/elki/application/visualization/KNNExplorer.java +++ b/src/de/lmu/ifi/dbs/elki/application/visualization/KNNExplorer.java @@ -57,6 +57,7 @@ import de.lmu.ifi.dbs.elki.data.VectorUtil; import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.StaticArrayDatabase; import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.query.DistanceResultPair; import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; import de.lmu.ifi.dbs.elki.database.query.knn.KNNQuery; @@ -375,7 +376,8 @@ public class KNNExplorer, D extends NumberDistance< double min = Double.MAX_VALUE; double max = Double.MIN_VALUE; - for(DBID objID : data.iterDBIDs()) { + for(DBIDIter iditer = data.iterDBIDs(); iditer.valid(); iditer.advance()) { + DBID objID = iditer.getDBID(); O vec = data.get(objID); DoubleMinMax mm = VectorUtil.getRangeDouble(vec); min = Math.min(min, mm.getMin()); @@ -408,7 +410,8 @@ public class KNNExplorer, D extends NumberDistance< svgCanvas.setPlot(plot); DefaultListModel m = new DefaultListModel(); - for(DBID dbid : data.iterDBIDs()) { + for(DBIDIter 
iditer = data.iterDBIDs(); iditer.valid(); iditer.advance()) { + DBID dbid = iditer.getDBID(); m.addElement(dbid); } seriesList.setModel(m); @@ -449,7 +452,7 @@ public class KNNExplorer, D extends NumberDistance< double dist = pair.getDistance().doubleValue() / maxdist; Color color = getColor(dist); String colstr = "#" + Integer.toHexString(color.getRGB()).substring(2); - String width = (pair.getDBID().equals(idx)) ? "0.5%" : "0.2%"; + String width = (pair.getDBID().sameDBID(idx)) ? "0.5%" : "0.2%"; SVGUtil.setStyle(line, "stroke: " + colstr + "; stroke-width: " + width + "; fill: none"); newe.appendChild(line); // put into cache diff --git a/src/de/lmu/ifi/dbs/elki/data/Cluster.java b/src/de/lmu/ifi/dbs/elki/data/Cluster.java index 6863e995..88f610ea 100644 --- a/src/de/lmu/ifi/dbs/elki/data/Cluster.java +++ b/src/de/lmu/ifi/dbs/elki/data/Cluster.java @@ -26,6 +26,7 @@ package de.lmu.ifi.dbs.elki.data; import java.util.ArrayList; import java.util.Comparator; import java.util.HashSet; +import java.util.Iterator; import java.util.List; import java.util.Set; @@ -37,12 +38,11 @@ import de.lmu.ifi.dbs.elki.utilities.datastructures.hierarchy.Hierarchical; import de.lmu.ifi.dbs.elki.utilities.datastructures.hierarchy.Hierarchy; import de.lmu.ifi.dbs.elki.utilities.datastructures.hierarchy.HierarchyReferenceLists; import de.lmu.ifi.dbs.elki.utilities.iterator.EmptyIterator; -import de.lmu.ifi.dbs.elki.utilities.iterator.IterableIterator; /** * Generic cluster class, that may or not have hierarchical information. Note - * that every cluster MUST have a DBIDs, since it implements the - * interface, too. Calls to the interface are proxied to the inner group object. + * that every cluster MUST have a DBIDs, since it implements the interface, too. + * Calls to the interface are proxied to the inner group object. * * A hierarchy object of class SimpleHierarchy will be created automatically * when a list of parents and children is provided. 
Alternatively, a @@ -271,7 +271,7 @@ public class Cluster implements Hierarchical>, TextW * Delegate to hierarchy object */ @Override - public IterableIterator> iterDescendants() { + public Iterator> iterDescendants() { if(hierarchy == null) { return EmptyIterator.STATIC(); } @@ -286,8 +286,8 @@ public class Cluster implements Hierarchical>, TextW public Set> getDescendants() { HashSet> set = new HashSet>(); // add all - for (Cluster c : iterDescendants()) { - set.add(c); + for(Iterator> iter = iterDescendants(); iter.hasNext();) { + set.add(iter.next()); } return set; } @@ -318,7 +318,7 @@ public class Cluster implements Hierarchical>, TextW * Delegate to hierarchy object */ @Override - public IterableIterator> iterAncestors() { + public Iterator> iterAncestors() { if(hierarchy == null) { return EmptyIterator.STATIC(); } @@ -520,6 +520,6 @@ public class Cluster implements Hierarchical>, TextW public String toString() { String mstr = (model == null) ? "null" : model.toString(); String nstr = noise ? 
",noise" : ""; - return "Cluster(size="+size()+",model="+mstr+nstr+")"; + return "Cluster(size=" + size() + ",model=" + mstr + nstr + ")"; } } \ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/data/Clustering.java b/src/de/lmu/ifi/dbs/elki/data/Clustering.java index eaac4418..f4312c4c 100644 --- a/src/de/lmu/ifi/dbs/elki/data/Clustering.java +++ b/src/de/lmu/ifi/dbs/elki/data/Clustering.java @@ -26,6 +26,7 @@ package de.lmu.ifi.dbs.elki.data; import java.util.ArrayList; import java.util.Collections; import java.util.HashSet; +import java.util.Iterator; import java.util.List; import java.util.Set; @@ -105,8 +106,8 @@ public class Clustering extends BasicResult { for(Cluster rc : toplevelclusters) { if(!clu.contains(rc)) { clu.add(rc); - for (Cluster c : rc.iterDescendants()) { - clu.add(c); + for (Iterator> iter = rc.iterDescendants(); iter.hasNext(); ) { + clu.add(iter.next()); } } } diff --git a/src/de/lmu/ifi/dbs/elki/data/NumberVector.java b/src/de/lmu/ifi/dbs/elki/data/NumberVector.java index 1964277c..dcb869f8 100644 --- a/src/de/lmu/ifi/dbs/elki/data/NumberVector.java +++ b/src/de/lmu/ifi/dbs/elki/data/NumberVector.java @@ -38,7 +38,6 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.Parameterizable; * @param the type of the attribute values * * @apiviz.landmark - * @apiviz.has Matrix * @apiviz.has Vector */ public interface NumberVector, N extends Number> extends FeatureVector, SpatialComparable, Parameterizable { diff --git a/src/de/lmu/ifi/dbs/elki/data/SparseDoubleVector.java b/src/de/lmu/ifi/dbs/elki/data/SparseDoubleVector.java new file mode 100644 index 00000000..10536058 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/data/SparseDoubleVector.java @@ -0,0 +1,341 @@ +package de.lmu.ifi.dbs.elki.data; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2012 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI 
Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see . + */ + +import gnu.trove.impl.unmodifiable.TUnmodifiableIntDoubleMap; +import gnu.trove.iterator.TIntDoubleIterator; +import gnu.trove.map.TIntDoubleMap; +import gnu.trove.map.hash.TIntDoubleHashMap; + +import java.util.Arrays; +import java.util.BitSet; + +import de.lmu.ifi.dbs.elki.datasource.parser.SparseNumberVectorLabelParser; +import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; +import de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.ArrayAdapter; +import de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.NumberArrayAdapter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; + +/** + *

+ * A SparseDoubleVector is to store real values as double values. + *

+ * + * A SparseDoubleVector only requires storage for those attribute values that are + * non-zero. + * + * @author Arthur Zimek + */ +// TODO: implement ByteArraySerializer +public class SparseDoubleVector extends AbstractNumberVector implements SparseNumberVector { + /** + * Static instance + */ + public static final SparseDoubleVector STATIC = new SparseDoubleVector(new int[0], new double[0], -1); + + /** + * Indexes of values + */ + private int[] indexes; + + /** + * Stored values + */ + private double[] values; + + /** + * The dimensionality of this feature vector. + */ + private int dimensionality; + + /** + * Direct constructor. + * + * @param indexes Indexes Must be sorted! + * @param values Associated value. + * @param dimensionality "true" dimensionality + */ + public SparseDoubleVector(int[] indexes, double[] values, int dimensionality) { + super(); + this.indexes = indexes; + this.values = values; + this.dimensionality = dimensionality; + } + + /** + * Provides a SparseDoubleVector consisting of double values according to the + * specified mapping of indices and values. 
+ * + * @param values the values to be set as values of the real vector + * @param dimensionality the dimensionality of this feature vector + * @throws IllegalArgumentException if the given dimensionality is too small + * to cover the given values (i.e., the maximum index of any value not + * zero is bigger than the given dimensionality) + */ + public SparseDoubleVector(TIntDoubleMap values, int dimensionality) throws IllegalArgumentException { + if(values.size() > dimensionality) { + throw new IllegalArgumentException("values.size() > dimensionality!"); + } + + this.indexes = new int[values.size()]; + this.values = new double[values.size()]; + // Import and sort the indexes + { + TIntDoubleIterator iter = values.iterator(); + for (int i = 0; iter.hasNext(); i++) { + iter.advance(); + this.indexes[i] = iter.key(); + } + Arrays.sort(this.indexes); + } + // Import the values accordingly + { + for(int i = 0; i < values.size(); i++) { + this.values[i] = values.get(this.indexes[i]); + } + } + this.dimensionality = dimensionality; + final int maxdim = getMaxDim(); + if(maxdim > dimensionality) { + throw new IllegalArgumentException("Given dimensionality " + dimensionality + " is too small w.r.t. the given values (occurring maximum: " + maxdim + ")."); + } + } + + /** + * Get the maximum dimensionality. + * + * @return the maximum dimensionality seen + */ + private int getMaxDim() { + if(this.indexes.length == 0) { + return 0; + } + else { + return this.indexes[this.indexes.length - 1]; + } + } + + /** + * Provides a SparseDoubleVector consisting of double values according to the + * specified mapping of indices and values. 
+ * + * @param values the values to be set as values of the real vector + * @throws IllegalArgumentException if the given dimensionality is too small + * to cover the given values (i.e., the maximum index of any value not + * zero is bigger than the given dimensionality) + */ + public SparseDoubleVector(double[] values) throws IllegalArgumentException { + this.dimensionality = values.length; + + // Count the number of non-zero entries + int size = 0; + { + for(int i = 0; i < values.length; i++) { + if(values[i] != 0.0f) { + size++; + } + } + } + this.indexes = new int[size]; + this.values = new double[size]; + + // Copy the values + { + int pos = 0; + for(int i = 0; i < values.length; i++) { + double value = values[i]; + if(value != 0.0f) { + this.indexes[pos] = i + 1; + this.values[pos] = value; + pos++; + } + } + } + } + + @Override + public int getDimensionality() { + return dimensionality; + } + + /** + * Sets the dimensionality to the new value. + * + * + * @param dimensionality the new dimensionality + * @throws IllegalArgumentException if the given dimensionality is too small + * to cover the given values (i.e., the maximum index of any value not + * zero is bigger than the given dimensionality) + */ + @Override + public void setDimensionality(int dimensionality) throws IllegalArgumentException { + final int maxdim = getMaxDim(); + if(maxdim > dimensionality) { + throw new IllegalArgumentException("Given dimensionality " + dimensionality + " is too small w.r.t. 
the given values (occurring maximum: " + maxdim + ")."); + } + this.dimensionality = dimensionality; + } + + @Override + public Double getValue(int dimension) { + int pos = Arrays.binarySearch(this.indexes, dimension); + if(pos >= 0) { + return values[pos]; + } + else { + return 0.0; + } + } + + @Override + public double doubleValue(int dimension) { + int pos = Arrays.binarySearch(this.indexes, dimension); + if(pos >= 0) { + return values[pos]; + } + else { + return 0.0; + } + } + + @Override + public long longValue(int dimension) { + int pos = Arrays.binarySearch(this.indexes, dimension); + if(pos >= 0) { + return (long) values[pos]; + } + else { + return 0; + } + } + + @Override + public Vector getColumnVector() { + double[] values = getValues(); + return new Vector(values); + } + + /** + *

+ * Provides a String representation of this SparseDoubleVector as suitable for + * {@link SparseNumberVectorLabelParser}. + *

+ * + *

+ * The returned String is a single line with entries separated by + * {@link AbstractNumberVector#ATTRIBUTE_SEPARATOR}. The first entry gives the + * number of non-zero values. The following entries are pairs of Integer + * and Double, where the Integer gives the index of the dimension and the + * Double gives the corresponding value. + *

+ * + *

+ * Example: a vector (0,1.2,1.3,0)<sup>T</sup> would result in the String
+ * <code>2 2 1.2 3 1.3</code>
+ *

+ * + * @return a String representation of this SparseDoubleVector + */ + @Override + public String toString() { + StringBuilder featureLine = new StringBuilder(); + featureLine.append(this.indexes.length); + for(int i = 0; i < this.indexes.length; i++) { + featureLine.append(ATTRIBUTE_SEPARATOR); + featureLine.append(this.indexes[i]); + featureLine.append(ATTRIBUTE_SEPARATOR); + featureLine.append(this.values[i]); + } + + return featureLine.toString(); + } + + /** + * Returns an array consisting of the values of this feature vector. + * + * @return an array consisting of the values of this feature vector + */ + private double[] getValues() { + double[] values = new double[dimensionality]; + for(int i = 0; i < indexes.length; i++) { + values[this.indexes[i]] = this.values[i]; + } + return values; + } + + @Override + public SparseDoubleVector newFeatureVector(A array, ArrayAdapter adapter) { + int dim = adapter.size(array); + double[] values = new double[dim]; + for(int i = 0; i < dim; i++) { + values[i] = adapter.get(array, i); + } + // TODO: inefficient + return new SparseDoubleVector(values); + } + + @Override + public SparseDoubleVector newNumberVector(A array, NumberArrayAdapter adapter) { + int dim = adapter.size(array); + double[] values = new double[dim]; + for(int i = 0; i < dim; i++) { + values[i] = adapter.getDouble(array, i); + } + // TODO: inefficient + return new SparseDoubleVector(values); + } + + @Override + public SparseDoubleVector newNumberVector(TIntDoubleMap values, int maxdim) { + return new SparseDoubleVector(values, maxdim); + } + + @Override + public BitSet getNotNullMask() { + BitSet b = new BitSet(); + for(int key : indexes) { + b.set(key); + } + return b; + } + + /** + * Parameterization class + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer extends AbstractParameterizer { + @Override + protected SparseDoubleVector makeInstance() { + return STATIC; + } + } + + /** + * Empty map. 
+ */ + public static final TIntDoubleMap EMPTYMAP = new TUnmodifiableIntDoubleMap(new TIntDoubleHashMap()); +} diff --git a/src/de/lmu/ifi/dbs/elki/data/SparseFloatVector.java b/src/de/lmu/ifi/dbs/elki/data/SparseFloatVector.java index 1ce8c5f7..36a4e171 100644 --- a/src/de/lmu/ifi/dbs/elki/data/SparseFloatVector.java +++ b/src/de/lmu/ifi/dbs/elki/data/SparseFloatVector.java @@ -24,14 +24,16 @@ package de.lmu.ifi.dbs.elki.data; */ import gnu.trove.impl.unmodifiable.TUnmodifiableIntFloatMap; +import gnu.trove.iterator.TIntDoubleIterator; import gnu.trove.iterator.TIntFloatIterator; +import gnu.trove.map.TIntDoubleMap; import gnu.trove.map.TIntFloatMap; import gnu.trove.map.hash.TIntFloatHashMap; import java.util.Arrays; import java.util.BitSet; -import de.lmu.ifi.dbs.elki.datasource.parser.SparseFloatVectorLabelParser; +import de.lmu.ifi.dbs.elki.datasource.parser.SparseNumberVectorLabelParser; import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; import de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.ArrayAdapter; import de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.NumberArrayAdapter; @@ -103,9 +105,9 @@ public class SparseFloatVector extends AbstractNumberVector dimensionality) { @@ -238,7 +241,7 @@ public class SparseFloatVector extends AbstractNumberVector * Provides a String representation of this SparseFloatVector as suitable for - * {@link SparseFloatVectorLabelParser}. + * {@link SparseNumberVectorLabelParser}. *

* *

@@ -305,6 +308,24 @@ public class SparseFloatVector extends AbstractNumberVector. */ - +import gnu.trove.map.TIntDoubleMap; /** * Combines the SparseFeatureVector and NumberVector * * @author Erich Schubert - * - * @param - * @param + * + * @param Vector type number type + * @param Number type */ public interface SparseNumberVector, N extends Number> extends NumberVector, SparseFeatureVector { - // Empty combination interface + /** + * Returns a new NumberVector of N for the given values. + * + * @param values the values of the NumberVector + * @param maxdim Maximum dimensionality. + * @return a new NumberVector of N for the given values + */ + V newNumberVector(TIntDoubleMap values, int maxdim); + + /** + * Update the vector space dimensionality. + * + * @param maxdim New dimensionality + */ + void setDimensionality(int maxdim); } diff --git a/src/de/lmu/ifi/dbs/elki/data/VectorUtil.java b/src/de/lmu/ifi/dbs/elki/data/VectorUtil.java index 1607b482..c8bf2c02 100644 --- a/src/de/lmu/ifi/dbs/elki/data/VectorUtil.java +++ b/src/de/lmu/ifi/dbs/elki/data/VectorUtil.java @@ -24,12 +24,20 @@ package de.lmu.ifi.dbs.elki.data; */ import java.util.BitSet; +import java.util.Comparator; import java.util.Random; import de.lmu.ifi.dbs.elki.data.spatial.SpatialComparable; +import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs; +import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.ids.DBIDs; +import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.math.DoubleMinMax; import de.lmu.ifi.dbs.elki.math.MathUtil; import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; +import de.lmu.ifi.dbs.elki.utilities.DatabaseUtil; +import de.lmu.ifi.dbs.elki.utilities.datastructures.QuickSelect; /** * Utility functions for use with vectors. @@ -268,4 +276,115 @@ public final class VectorUtil { } return result; } + + /** + * Compute medoid for a given subset. 
+ * + * @param relation Relation to process + * @param sample Sample set + * @return Medoid vector + */ + public static Vector computeMedoid(Relation> relation, DBIDs sample) { + final int dim = DatabaseUtil.dimensionality(relation); + ArrayModifiableDBIDs mids = DBIDUtil.newArray(sample); + SortDBIDsBySingleDimension s = new SortDBIDsBySingleDimension(relation); + Vector medoid = new Vector(dim); + for (int d = 0; d < dim; d++) { + s.setDimension(d + 1); + medoid.set(d, relation.get(QuickSelect.median(mids, s)).doubleValue(d + 1)); + } + return medoid; + } + + /** + * Compare number vectors by a single dimension + * + * @author Erich Schubert + */ + public static class SortDBIDsBySingleDimension implements Comparator { + /** + * Dimension to sort with + */ + public int d; + + /** + * The relation to sort. + */ + private Relation> data; + + /** + * Constructor. + * + * @param data Vector data source + */ + public SortDBIDsBySingleDimension(Relation> data) { + super(); + this.data = data; + }; + + /** + * Get the dimension to sort by + * + * @return Dimension to sort with + */ + public int getDimension() { + return this.d; + } + + /** + * Set the dimension to sort by + * + * @param d Dimension to sort with + */ + public void setDimension(int d) { + this.d = d; + } + + @Override + public int compare(DBID id1, DBID id2) { + return Double.compare(data.get(id1).doubleValue(d), data.get(id2).doubleValue(d)); + } + } + + /** + * Compare number vectors by a single dimension + * + * @author Erich Schubert + */ + public static class SortVectorsBySingleDimension implements Comparator> { + /** + * Dimension to sort with + */ + public int d; + + /** + * Constructor. 
+ */ + public SortVectorsBySingleDimension() { + super(); + }; + + /** + * Get the dimension to sort by + * + * @return Dimension to sort with + */ + public int getDimension() { + return this.d; + } + + /** + * Set the dimension to sort by + * + * @param d Dimension to sort with + */ + public void setDimension(int d) { + this.d = d; + } + + @Override + public int compare(NumberVector o1, NumberVector o2) { + return Double.compare(o1.doubleValue(d), o2.doubleValue(d)); + } + } } \ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/data/model/CorrelationAnalysisSolution.java b/src/de/lmu/ifi/dbs/elki/data/model/CorrelationAnalysisSolution.java index e6cb893f..56fe8f9d 100644 --- a/src/de/lmu/ifi/dbs/elki/data/model/CorrelationAnalysisSolution.java +++ b/src/de/lmu/ifi/dbs/elki/data/model/CorrelationAnalysisSolution.java @@ -27,15 +27,15 @@ import java.text.NumberFormat; import java.util.Locale; import de.lmu.ifi.dbs.elki.data.NumberVector; -import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.datasource.filter.normalization.NonNumericFeaturesException; +import de.lmu.ifi.dbs.elki.datasource.filter.normalization.Normalization; import de.lmu.ifi.dbs.elki.logging.LoggingUtil; import de.lmu.ifi.dbs.elki.math.linearalgebra.LinearEquationSystem; import de.lmu.ifi.dbs.elki.math.linearalgebra.Matrix; import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; -import de.lmu.ifi.dbs.elki.datasource.filter.normalization.NonNumericFeaturesException; -import de.lmu.ifi.dbs.elki.datasource.filter.normalization.Normalization; import de.lmu.ifi.dbs.elki.result.Result; import de.lmu.ifi.dbs.elki.result.textwriter.TextWriteable; import de.lmu.ifi.dbs.elki.result.textwriter.TextWriterStream; @@ -141,8 +141,8 @@ public class CorrelationAnalysisSolution> implement // determine standard deviation double 
variance = 0; DBIDs ids = db.getDBIDs(); - for (DBID id : ids) { - double distance = distance(db.get(id).getColumnVector()); + for(DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) { + double distance = distance(db.get(iter).getColumnVector()); variance += distance * distance; } standardDeviation = Math.sqrt(variance / ids.size()); @@ -213,7 +213,7 @@ public class CorrelationAnalysisSolution> implement * @return the error vectors */ public Vector errorVector(V p) { - return p.getColumnVector().minus(centroid).projection(weakEigenvectors); + return p.getColumnVector().minusEquals(centroid).projection(weakEigenvectors); } /** @@ -223,7 +223,7 @@ public class CorrelationAnalysisSolution> implement * @return the data projections */ public Matrix dataProjections(V p) { - Vector centered = p.getColumnVector().minus(centroid); + Vector centered = p.getColumnVector().minusEquals(centroid); Matrix sum = new Matrix(p.getDimensionality(), strongEigenvectors.getColumnDimensionality()); for(int i = 0; i < strongEigenvectors.getColumnDimensionality(); i++) { Vector v_i = strongEigenvectors.getCol(i); @@ -240,7 +240,7 @@ public class CorrelationAnalysisSolution> implement * @return the error vectors */ public Vector dataVector(V p) { - return p.getColumnVector().minus(centroid).projection(strongEigenvectors); + return p.getColumnVector().minusEquals(centroid).projection(strongEigenvectors); } /** @@ -254,21 +254,21 @@ public class CorrelationAnalysisSolution> implement } /** - * Returns a copy of the strong eigenvectors. + * Returns the strong eigenvectors. * - * @return a copy of the strong eigenvectors + * @return the strong eigenvectors */ public Matrix getStrongEigenvectors() { - return strongEigenvectors.copy(); + return strongEigenvectors; } /** - * Returns a copy of the weak eigenvectors. + * Returns the weak eigenvectors. 
* - * @return a copy of the weak eigenvectors + * @return the weak eigenvectors */ public Matrix getWeakEigenvectors() { - return weakEigenvectors.copy(); + return weakEigenvectors; } /** diff --git a/src/de/lmu/ifi/dbs/elki/data/model/MedoidModel.java b/src/de/lmu/ifi/dbs/elki/data/model/MedoidModel.java new file mode 100644 index 00000000..5540b931 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/data/model/MedoidModel.java @@ -0,0 +1,76 @@ +package de.lmu.ifi.dbs.elki.data.model; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2012 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see . + */ + +import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.result.textwriter.TextWriteable; +import de.lmu.ifi.dbs.elki.result.textwriter.TextWriterStream; + +/** + * Cluster model that stores a medoid for the cluster.
+ * + * @author Erich Schubert + */ +public class MedoidModel extends BaseModel implements TextWriteable { + /** + * Cluster medoid + */ + private DBID medoid; + + /** + * Constructor with medoid + * + * @param medoid Cluster medoid + */ + public MedoidModel(DBID medoid) { + super(); + this.medoid = medoid; + } + + /** + * @return medoid + */ + public DBID getMedoid() { + return medoid; + } + + /** + * @param medoid Medoid object + */ + public void setMedoid(DBID medoid) { + this.medoid = medoid; + } + + /** + * Implementation of {@link TextWriteable} interface. + */ + @Override + public void writeToText(TextWriterStream out, String label) { + if(label != null) { + out.commentPrintLn(label); + } + out.commentPrintLn(TextWriterStream.SER_MARKER + " " + getClass().getName()); + out.commentPrintLn("Cluster Medoid: " + medoid.toString()); + } +} diff --git a/src/de/lmu/ifi/dbs/elki/data/projection/AbstractFeatureSelection.java b/src/de/lmu/ifi/dbs/elki/data/projection/AbstractFeatureSelection.java deleted file mode 100644 index fcd5fd84..00000000 --- a/src/de/lmu/ifi/dbs/elki/data/projection/AbstractFeatureSelection.java +++ /dev/null @@ -1,64 +0,0 @@ -package de.lmu.ifi.dbs.elki.data.projection; - -import de.lmu.ifi.dbs.elki.data.FeatureVector; -/* - This file is part of ELKI: - Environment for Developing KDD-Applications Supported by Index-Structures - - Copyright (C) 2012 - Ludwig-Maximilians-Universität München - Lehr- und Forschungseinheit für Datenbanksysteme - ELKI Development Team - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the - GNU Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see . - */ -import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation; -import de.lmu.ifi.dbs.elki.data.type.TypeInformation; -import de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.SubsetArrayAdapter; - -/** - * Abstract feature selection projection - * - * @author Erich Schubert - * - * @param Vector type - * @param Feature type - */ -public abstract class AbstractFeatureSelection, F> implements Projection { - /** - * Array adapter - */ - protected SubsetArrayAdapter adapter; - - /** - * Constructor. - * - * @param adapter Data adapter - */ - public AbstractFeatureSelection(SubsetArrayAdapter adapter) { - super(); - this.adapter = adapter; - } - - @Override - public V project(V data) { - return data.newFeatureVector(data, adapter); - } - - @Override - abstract public SimpleTypeInformation getOutputDataTypeInformation(); - - @Override - abstract public TypeInformation getInputDataTypeInformation(); -} \ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/data/projection/FeatureSelection.java b/src/de/lmu/ifi/dbs/elki/data/projection/FeatureSelection.java index fca75e3f..04878aea 100644 --- a/src/de/lmu/ifi/dbs/elki/data/projection/FeatureSelection.java +++ b/src/de/lmu/ifi/dbs/elki/data/projection/FeatureSelection.java @@ -39,7 +39,7 @@ import de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.SubsetArrayAdapter * @param Vector type * @param Feature type */ -public class FeatureSelection, F> extends AbstractFeatureSelection { +public class FeatureSelection, F> implements Projection { /** * Minimum dimensionality required for projection */ @@ -55,6 +55,11 @@ public class FeatureSelection, F> extends Abstract */ private int dimensionality; + /** + * Array adapter + */ + protected ArrayAdapter adapter; + /** * Constructor. 
* @@ -62,7 +67,7 @@ public class FeatureSelection, F> extends Abstract * @param factory Object factory */ public FeatureSelection(int[] dims, V factory) { - super(new SubsetArrayAdapter(getAdapter(factory), dims)); + this.adapter = new SubsetArrayAdapter(getAdapter(factory), dims); this.factory = factory; this.dimensionality = dims.length; @@ -73,6 +78,11 @@ public class FeatureSelection, F> extends Abstract this.mindim = mindim; } + @Override + public V project(V data) { + return data.newFeatureVector(data, adapter); + } + /** * Choose the best adapter for this. * diff --git a/src/de/lmu/ifi/dbs/elki/data/projection/NumericalFeatureSelection.java b/src/de/lmu/ifi/dbs/elki/data/projection/NumericalFeatureSelection.java index 7dfd580f..bd41e1cf 100644 --- a/src/de/lmu/ifi/dbs/elki/data/projection/NumericalFeatureSelection.java +++ b/src/de/lmu/ifi/dbs/elki/data/projection/NumericalFeatureSelection.java @@ -1,4 +1,5 @@ package de.lmu.ifi.dbs.elki.data.projection; + /* This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures @@ -22,13 +23,12 @@ package de.lmu.ifi.dbs.elki.data.projection; along with this program. If not, see . */ +import java.util.BitSet; + import de.lmu.ifi.dbs.elki.data.NumberVector; import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation; import de.lmu.ifi.dbs.elki.data.type.TypeInformation; import de.lmu.ifi.dbs.elki.data.type.VectorTypeInformation; -import de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.ArrayLikeUtil; -import de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.NumberArrayAdapter; -import de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.SubsetArrayAdapter; /** * Projection class for number vectors. 
@@ -36,9 +36,8 @@ import de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.SubsetArrayAdapter * @author Erich Schubert * * @param Vector type - * @param Number type */ -public class NumericalFeatureSelection, N extends Number> extends AbstractFeatureSelection { +public class NumericalFeatureSelection> implements Projection { /** * Minimum dimensionality required for projection */ @@ -54,38 +53,37 @@ public class NumericalFeatureSelection, N extends N */ private int dimensionality; + /** + * Subspace + */ + private BitSet bits; + /** * Constructor. * - * @param dims Dimensions + * @param bits Dimensions * @param factory Object factory */ - public NumericalFeatureSelection(int[] dims, V factory) { - super(new SubsetArrayAdapter(getAdapter(factory), dims)); + public NumericalFeatureSelection(BitSet bits, V factory) { + super(); + this.bits = bits; this.factory = factory; - this.dimensionality = dims.length; + this.dimensionality = bits.cardinality(); int mindim = 0; - for(int dim : dims) { - mindim = Math.max(mindim, dim + 1); + for(int i = bits.nextSetBit(0); i >= 0; i = bits.nextSetBit(i + 1)) { + mindim = Math.max(mindim, i + 1); } this.mindim = mindim; } - /** - * Choose the best adapter for this. 
- * - * @param factory Object factory, for type inference - * @return Adapter - */ - private static , N extends Number> NumberArrayAdapter getAdapter(V factory) { - return ArrayLikeUtil.numberVectorAdapter(factory); - } - - @SuppressWarnings("unchecked") @Override public V project(V data) { - return factory.newNumberVector(data, (NumberArrayAdapter) adapter); + double[] dbl = new double[dimensionality]; + for(int i = bits.nextSetBit(0), j = 0; i >= 0; i = bits.nextSetBit(i + 1), j++) { + dbl[j] = data.doubleValue(i + 1); + } + return factory.newNumberVector(dbl); } @Override diff --git a/src/de/lmu/ifi/dbs/elki/data/synthetic/bymodel/GeneratorMain.java b/src/de/lmu/ifi/dbs/elki/data/synthetic/bymodel/GeneratorMain.java index 6870fcaa..4d93e1ea 100644 --- a/src/de/lmu/ifi/dbs/elki/data/synthetic/bymodel/GeneratorMain.java +++ b/src/de/lmu/ifi/dbs/elki/data/synthetic/bymodel/GeneratorMain.java @@ -70,7 +70,12 @@ public class GeneratorMain { /** * List of clusters to generate */ - private LinkedList generators = new LinkedList(); + protected LinkedList generators = new LinkedList(); + + /** + * Controls whether points are tested against the model during generation + */ + protected boolean testAgainstModel = true; /** * Add a cluster to the cluster list. @@ -81,11 +86,6 @@ public class GeneratorMain { generators.add(c); } - /** - * Controls whether points are tested against the model during generation - */ - private boolean testAgainstModel = true; - /** * Main loop to generate data set. * @@ -154,6 +154,13 @@ public class GeneratorMain { cursclus.incrementDiscarded(); } } + } else { + // Keep all. 
+ for (Vector p : newp) { + DoubleVector dv = new DoubleVector(p); + bundle.appendSimple(dv, l, model); + ++kept; + } } } } diff --git a/src/de/lmu/ifi/dbs/elki/data/synthetic/bymodel/GeneratorSingleCluster.java b/src/de/lmu/ifi/dbs/elki/data/synthetic/bymodel/GeneratorSingleCluster.java index f9818916..d2970de7 100644 --- a/src/de/lmu/ifi/dbs/elki/data/synthetic/bymodel/GeneratorSingleCluster.java +++ b/src/de/lmu/ifi/dbs/elki/data/synthetic/bymodel/GeneratorSingleCluster.java @@ -30,7 +30,7 @@ import java.util.Random; import de.lmu.ifi.dbs.elki.data.model.Model; import de.lmu.ifi.dbs.elki.math.linearalgebra.AffineTransformation; import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; -import de.lmu.ifi.dbs.elki.math.statistics.distribution.Distribution; +import de.lmu.ifi.dbs.elki.math.statistics.distribution.DistributionWithRandom; import de.lmu.ifi.dbs.elki.utilities.exceptions.UnableToComplyException; /** @@ -40,14 +40,14 @@ import de.lmu.ifi.dbs.elki.utilities.exceptions.UnableToComplyException; * * @author Erich Schubert * - * @apiviz.composedOf Distribution + * @apiviz.composedOf DistributionWithRandom * @apiviz.composedOf AffineTransformation */ public class GeneratorSingleCluster implements GeneratorInterfaceDynamic, Model { /** * The distribution generators for each axis */ - private List axes = new ArrayList(); + private List axes = new ArrayList(); /** * The transformation matrix @@ -121,7 +121,7 @@ public class GeneratorSingleCluster implements GeneratorInterfaceDynamic, Model * @throws UnableToComplyException thrown when no new generators may be added * anymore */ - public void addGenerator(Distribution gen) throws UnableToComplyException { + public void addGenerator(DistributionWithRandom gen) throws UnableToComplyException { if(trans != null) { throw new UnableToComplyException("Generators may no longer be added when transformations have been applied."); } @@ -235,7 +235,7 @@ public class GeneratorSingleCluster implements GeneratorInterfaceDynamic, 
Model while(result.size() < count) { double[] d = new double[dim]; int i = 0; - for(Distribution axis : axes) { + for(DistributionWithRandom axis : axes) { d[i] = axis.nextRandom(); i++; } @@ -269,7 +269,7 @@ public class GeneratorSingleCluster implements GeneratorInterfaceDynamic, Model double density = 1.0; int i = 0; - for(Distribution axis : axes) { + for(DistributionWithRandom axis : axes) { density = density * axis.pdf(o.get(i)); i++; } @@ -389,6 +389,7 @@ public class GeneratorSingleCluster implements GeneratorInterfaceDynamic, Model * * @return Model */ + @Override public Model makeModel() { return this; } @@ -399,7 +400,7 @@ public class GeneratorSingleCluster implements GeneratorInterfaceDynamic, Model * @param i Generator axis i * @return Distribution */ - public Distribution getDistribution(int i) { + public DistributionWithRandom getDistribution(int i) { return axes.get(i); } } \ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/data/type/NoSupportedDataTypeException.java b/src/de/lmu/ifi/dbs/elki/data/type/NoSupportedDataTypeException.java index 9865e96f..188f962c 100644 --- a/src/de/lmu/ifi/dbs/elki/data/type/NoSupportedDataTypeException.java +++ b/src/de/lmu/ifi/dbs/elki/data/type/NoSupportedDataTypeException.java @@ -23,6 +23,7 @@ package de.lmu.ifi.dbs.elki.data.type; along with this program. If not, see . */ +import java.util.Collection; /** * Exception thrown when no supported data type was found. @@ -37,19 +38,41 @@ public class NoSupportedDataTypeException extends IllegalStateException { */ private static final long serialVersionUID = 1L; + /** + * Available types + */ + private Collection types = null; + /** * Constructor. + * + * @param type Requested type + * @param types Available types. 
*/ - public NoSupportedDataTypeException(TypeInformation type) { + public NoSupportedDataTypeException(TypeInformation type, Collection types) { super("No data type found satisfying: " + type.toString()); + this.types = types; } /** - * Constructor with string message. If possible, use the type parameter instead! - * + * Constructor with string message. If possible, use the type parameter + * instead! + * * @param string Error message */ public NoSupportedDataTypeException(String string) { super(string); } + + @Override + public String getMessage() { + StringBuffer buf = new StringBuffer(super.getMessage()); + if(types != null) { + buf.append("\nAvailable types:"); + for(TypeInformation type : types) { + buf.append(" ").append(type.toString()); + } + } + return buf.toString(); + } } \ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/data/type/TypeUtil.java b/src/de/lmu/ifi/dbs/elki/data/type/TypeUtil.java index 91257906..4d62a453 100644 --- a/src/de/lmu/ifi/dbs/elki/data/type/TypeUtil.java +++ b/src/de/lmu/ifi/dbs/elki/data/type/TypeUtil.java @@ -27,13 +27,18 @@ import de.lmu.ifi.dbs.elki.data.BitVector; import de.lmu.ifi.dbs.elki.data.ClassLabel; import de.lmu.ifi.dbs.elki.data.DoubleVector; import de.lmu.ifi.dbs.elki.data.ExternalID; +import de.lmu.ifi.dbs.elki.data.FloatVector; import de.lmu.ifi.dbs.elki.data.LabelList; import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.data.SparseDoubleVector; import de.lmu.ifi.dbs.elki.data.SparseFloatVector; +import de.lmu.ifi.dbs.elki.data.SparseNumberVector; import de.lmu.ifi.dbs.elki.data.model.Model; import de.lmu.ifi.dbs.elki.data.spatial.PolygonsObject; import de.lmu.ifi.dbs.elki.database.ids.DBID; import de.lmu.ifi.dbs.elki.database.ids.DBIDFactory; +import de.lmu.ifi.dbs.elki.database.ids.DBIDs; +import de.lmu.ifi.dbs.elki.database.query.DistanceDBIDResult; import de.lmu.ifi.dbs.elki.math.linearalgebra.Matrix; import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; import 
de.lmu.ifi.dbs.elki.persistent.ByteArrayUtil; @@ -57,6 +62,11 @@ public final class TypeUtil { */ public static final SimpleTypeInformation DBID = new SimpleTypeInformation(DBID.class, DBIDFactory.FACTORY.getDBIDSerializer()); + /** + * Database ID lists + */ + public static final SimpleTypeInformation DBIDS = new SimpleTypeInformation(DBIDs.class); + /** * A string */ @@ -72,6 +82,11 @@ public final class TypeUtil { */ public static final SimpleTypeInformation LABELLIST = new SimpleTypeInformation(LabelList.class); + /** + * A list of neighbors + */ + public static final SimpleTypeInformation> NEIGHBORLIST = new SimpleTypeInformation>(DistanceDBIDResult.class); + /** * Either class label, object labels or a string - anything that will be * accepted by @@ -96,6 +111,13 @@ public final class TypeUtil { */ public static final VectorFieldTypeInformation DOUBLE_VECTOR_FIELD = new VectorFieldTypeInformation(DoubleVector.class, DoubleVector.STATIC); + /** + * Input type for algorithms that require number vector fields. + * + * If possible, please use {@link #NUMBER_VECTOR_FIELD}! + */ + public static final VectorFieldTypeInformation FLOAT_VECTOR_FIELD = new VectorFieldTypeInformation(FloatVector.class, FloatVector.STATIC); + /** * Input type for algorithms that require number vector fields. */ @@ -104,8 +126,27 @@ public final class TypeUtil { /** * Sparse float vector field. */ + public static final SimpleTypeInformation> SPARSE_VECTOR_VARIABLE_LENGTH = new SimpleTypeInformation>(SparseNumberVector.class); + + /** + * Sparse vector field. + */ + public static final VectorFieldTypeInformation> SPARSE_VECTOR_FIELD = new VectorFieldTypeInformation>(SparseNumberVector.class); + + /** + * Sparse float vector field. + * + * If possible, please use {@link #SPARSE_VECTOR_FIELD} instead! + */ public static final VectorFieldTypeInformation SPARSE_FLOAT_FIELD = new VectorFieldTypeInformation(SparseFloatVector.class); + /** + * Sparse double vector field. 
+ * + * If possible, please use {@link #SPARSE_VECTOR_FIELD} instead! + */ + public static final VectorFieldTypeInformation SPARSE_DOUBLE_FIELD = new VectorFieldTypeInformation(SparseDoubleVector.class); + /** * External ID type */ diff --git a/src/de/lmu/ifi/dbs/elki/data/type/VectorFieldTypeInformation.java b/src/de/lmu/ifi/dbs/elki/data/type/VectorFieldTypeInformation.java index 69e0c969..9e6c2470 100644 --- a/src/de/lmu/ifi/dbs/elki/data/type/VectorFieldTypeInformation.java +++ b/src/de/lmu/ifi/dbs/elki/data/type/VectorFieldTypeInformation.java @@ -234,12 +234,20 @@ public class VectorFieldTypeInformation> extends V @Override public String toString() { + StringBuffer buf = new StringBuffer(getRestrictionClass().getSimpleName()); if(mindim == maxdim) { - return getRestrictionClass().getSimpleName() + ",dim=" + mindim; + buf.append(",dim=").append(mindim); } else { - return super.toString(); + buf.append(",field"); + if(mindim >= 0) { + buf.append(",mindim=" + mindim); + } + if(maxdim < Integer.MAX_VALUE) { + buf.append(",maxdim=" + maxdim); + } } + return buf.toString(); } /** diff --git a/src/de/lmu/ifi/dbs/elki/data/type/VectorTypeInformation.java b/src/de/lmu/ifi/dbs/elki/data/type/VectorTypeInformation.java index 012cff48..a1db6d1c 100644 --- a/src/de/lmu/ifi/dbs/elki/data/type/VectorTypeInformation.java +++ b/src/de/lmu/ifi/dbs/elki/data/type/VectorTypeInformation.java @@ -180,6 +180,7 @@ public class VectorTypeInformation> extends Simple @Override public String toString() { StringBuffer buf = new StringBuffer(super.toString()); + buf.append(",variable"); if (mindim >= 0) { buf.append(",mindim="+mindim); } diff --git a/src/de/lmu/ifi/dbs/elki/database/AbstractDatabase.java b/src/de/lmu/ifi/dbs/elki/database/AbstractDatabase.java index f9582039..e0fc6bf2 100644 --- a/src/de/lmu/ifi/dbs/elki/database/AbstractDatabase.java +++ b/src/de/lmu/ifi/dbs/elki/database/AbstractDatabase.java @@ -23,6 +23,7 @@ package de.lmu.ifi.dbs.elki.database; along with this 
program. If not, see . */ +import java.util.ArrayList; import java.util.Collection; import java.util.Collections; import java.util.List; @@ -56,7 +57,8 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; /** * Abstract base class for database API implementations. Provides default - * management of relations, indexes and events as well as default query matching. + * management of relations, indexes and events as well as default query + * matching. * * @author Erich Schubert * @@ -137,7 +139,7 @@ public abstract class AbstractDatabase extends AbstractHierarchicalResult implem throw e; } } - + @Override public Collection> getRelations() { return Collections.unmodifiableCollection(relations); @@ -152,15 +154,11 @@ public abstract class AbstractDatabase extends AbstractHierarchicalResult implem return (Relation) relation; } } - if (getLogger().isDebugging()) { - StringBuffer buf = new StringBuffer(); - buf.append("No matching relation for type ").append(restriction.toString()).append(":\n"); - for(Relation relation : relations) { - buf.append(relation.getDataTypeInformation().toString()).append(","); - } - getLogger().debug(buf); + List types = new ArrayList(relations.size()); + for(Relation relation : relations) { + types.add(relation.getDataTypeInformation()); } - throw new NoSupportedDataTypeException(restriction); + throw new NoSupportedDataTypeException(restriction, types); } @Override diff --git a/src/de/lmu/ifi/dbs/elki/database/HashmapDatabase.java b/src/de/lmu/ifi/dbs/elki/database/HashmapDatabase.java index e94bcfb1..a92c4427 100644 --- a/src/de/lmu/ifi/dbs/elki/database/HashmapDatabase.java +++ b/src/de/lmu/ifi/dbs/elki/database/HashmapDatabase.java @@ -31,6 +31,7 @@ import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation; import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.ids.DBID; import de.lmu.ifi.dbs.elki.database.ids.DBIDFactory; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import 
de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; import de.lmu.ifi.dbs.elki.database.ids.HashSetModifiableDBIDs; @@ -62,7 +63,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; * @author Erich Schubert * * @apiviz.landmark - * @apiviz.composedOf TreeSetModifiableDBIDs + * @apiviz.composedOf HashSetModifiableDBIDs */ @Description("Database using an in-memory hashtable and at least providing linear scans.") public class HashmapDatabase extends AbstractDatabase implements UpdatableDatabase, Parameterizable { @@ -246,13 +247,14 @@ public class HashmapDatabase extends AbstractDatabase implements UpdatableDataba MultipleObjectsBundle bundle = new MultipleObjectsBundle(); for(Relation relation : relations) { ArrayList data = new ArrayList(ids.size()); - for(DBID id : ids) { - data.add(relation.get(id)); + for (DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) { + data.add(relation.get(iter)); } bundle.appendColumn(relation.getDataTypeInformation(), data); } // remove from db - for(DBID id : ids) { + for (DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) { + DBID id = iter.getDBID(); doDelete(id); } // Remove from indexes diff --git a/src/de/lmu/ifi/dbs/elki/database/datastore/DataStore.java b/src/de/lmu/ifi/dbs/elki/database/datastore/DataStore.java index 8d7f73ce..a6d5a704 100644 --- a/src/de/lmu/ifi/dbs/elki/database/datastore/DataStore.java +++ b/src/de/lmu/ifi/dbs/elki/database/datastore/DataStore.java @@ -24,6 +24,7 @@ package de.lmu.ifi.dbs.elki.database.datastore; */ import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDRef; import de.lmu.ifi.dbs.elki.result.Result; /** @@ -40,5 +41,5 @@ public interface DataStore extends Result { * @param id Database ID. 
* @return Object or {@code null} */ - public T get(DBID id); -} + public T get(DBIDRef id); +} \ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/database/datastore/DataStoreFactory.java b/src/de/lmu/ifi/dbs/elki/database/datastore/DataStoreFactory.java index 65a1265d..2ed9f536 100644 --- a/src/de/lmu/ifi/dbs/elki/database/datastore/DataStoreFactory.java +++ b/src/de/lmu/ifi/dbs/elki/database/datastore/DataStoreFactory.java @@ -90,6 +90,38 @@ public interface DataStoreFactory { */ public WritableDoubleDataStore makeDoubleStorage(DBIDs ids, int hints); + /** + * Make a new storage, to associate the given ids with an object of class + * dataclass. + * + * @param ids DBIDs to store data for + * @param hints Hints for the storage manager + * @param def Default value + * @return new data store + */ + public WritableDoubleDataStore makeDoubleStorage(DBIDs ids, int hints, double def); + + /** + * Make a new storage, to associate the given ids with an object of class + * dataclass. + * + * @param ids DBIDs to store data for + * @param hints Hints for the storage manager + * @return new data store + */ + public WritableIntegerDataStore makeIntegerStorage(DBIDs ids, int hints); + + /** + * Make a new storage, to associate the given ids with an object of class + * dataclass. + * + * @param ids DBIDs to store data for + * @param hints Hints for the storage manager + * @param def Default value + * @return new data store + */ + public WritableIntegerDataStore makeIntegerStorage(DBIDs ids, int hints, int def); + /** * Make a new record storage, to associate the given ids with an object of * class dataclass. 
diff --git a/src/de/lmu/ifi/dbs/elki/database/datastore/DataStoreIDMap.java b/src/de/lmu/ifi/dbs/elki/database/datastore/DataStoreIDMap.java index ead75709..dada881a 100644 --- a/src/de/lmu/ifi/dbs/elki/database/datastore/DataStoreIDMap.java +++ b/src/de/lmu/ifi/dbs/elki/database/datastore/DataStoreIDMap.java @@ -23,7 +23,7 @@ package de.lmu.ifi.dbs.elki.database.datastore; along with this program. If not, see . */ -import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDRef; /** * Interface to map DBIDs to integer record ids for use in storage. @@ -37,5 +37,5 @@ public interface DataStoreIDMap { * @param dbid DBID * @return record id {@code id >= 0} */ - public int map(DBID dbid); + public int map(DBIDRef dbid); } diff --git a/src/de/lmu/ifi/dbs/elki/database/datastore/DataStoreUtil.java b/src/de/lmu/ifi/dbs/elki/database/datastore/DataStoreUtil.java index f299c1e8..a8afeaec 100644 --- a/src/de/lmu/ifi/dbs/elki/database/datastore/DataStoreUtil.java +++ b/src/de/lmu/ifi/dbs/elki/database/datastore/DataStoreUtil.java @@ -58,6 +58,41 @@ public final class DataStoreUtil { return DataStoreFactory.FACTORY.makeDoubleStorage(ids, hints); } + /** + * Make a new storage, to associate the given ids with an object of class dataclass. + * + * @param ids DBIDs to store data for + * @param hints Hints for the storage manager + * @param def Default value + * @return new data store + */ + public static WritableDoubleDataStore makeDoubleStorage(DBIDs ids, int hints, double def) { + return DataStoreFactory.FACTORY.makeDoubleStorage(ids, hints, def); + } + + /** + * Make a new storage, to associate the given ids with an object of class dataclass. 
+ * + * @param ids DBIDs to store data for + * @param hints Hints for the storage manager + * @return new data store + */ + public static WritableIntegerDataStore makeIntegerStorage(DBIDs ids, int hints) { + return DataStoreFactory.FACTORY.makeIntegerStorage(ids, hints); + } + + /** + * Make a new storage, to associate the given ids with an object of class dataclass. + * + * @param ids DBIDs to store data for + * @param hints Hints for the storage manager + * @param def Default value + * @return new data store + */ + public static WritableIntegerDataStore makeIntegerStorage(DBIDs ids, int hints, int def) { + return DataStoreFactory.FACTORY.makeIntegerStorage(ids, hints, def); + } + /** * Make a new record storage, to associate the given ids with an object of class dataclass. * diff --git a/src/de/lmu/ifi/dbs/elki/database/datastore/DoubleDataStore.java b/src/de/lmu/ifi/dbs/elki/database/datastore/DoubleDataStore.java index ee315a0c..3348a246 100644 --- a/src/de/lmu/ifi/dbs/elki/database/datastore/DoubleDataStore.java +++ b/src/de/lmu/ifi/dbs/elki/database/datastore/DoubleDataStore.java @@ -1,7 +1,5 @@ package de.lmu.ifi.dbs.elki.database.datastore; -import de.lmu.ifi.dbs.elki.database.ids.DBID; - /* This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures @@ -25,14 +23,22 @@ import de.lmu.ifi.dbs.elki.database.ids.DBID; along with this program. If not, see . */ +import de.lmu.ifi.dbs.elki.database.ids.DBIDRef; + /** * Double-valued data store (avoids boxing/unboxing). * * @author Erich Schubert */ public interface DoubleDataStore extends DataStore { + /** + * Getter, but using objects. + * + * @deprecated Use {@link #doubleValue} instead, to avoid boxing/unboxing cost. + */ + @Override @Deprecated - public Double get(DBID id); + public Double get(DBIDRef id); /** * Retrieves an object from the storage. @@ -40,5 +46,5 @@ public interface DoubleDataStore extends DataStore { * @param id Database ID. 
* @return Double value */ - public double doubleValue(DBID id); + public double doubleValue(DBIDRef id); } \ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/database/datastore/IntegerDataStore.java b/src/de/lmu/ifi/dbs/elki/database/datastore/IntegerDataStore.java new file mode 100644 index 00000000..e450c11b --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/database/datastore/IntegerDataStore.java @@ -0,0 +1,50 @@ +package de.lmu.ifi.dbs.elki.database.datastore; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2012 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see . + */ + +import de.lmu.ifi.dbs.elki.database.ids.DBIDRef; + +/** + * Integer-valued data store (avoids boxing/unboxing). + * + * @author Erich Schubert + */ +public interface IntegerDataStore extends DataStore { + /** + * Getter, but using objects. + * + * @deprecated Use {@link #intValue} instead, to avoid boxing/unboxing cost. + */ + @Override + @Deprecated + public Integer get(DBIDRef id); + + /** + * Retrieves an object from the storage. + * + * @param id Database ID. 
+ * @return Double value + */ + public int intValue(DBIDRef id); +} \ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/database/datastore/RangeIDMap.java b/src/de/lmu/ifi/dbs/elki/database/datastore/RangeIDMap.java index 7fc0ed7f..e00f9ff8 100644 --- a/src/de/lmu/ifi/dbs/elki/database/datastore/RangeIDMap.java +++ b/src/de/lmu/ifi/dbs/elki/database/datastore/RangeIDMap.java @@ -23,8 +23,8 @@ package de.lmu.ifi.dbs.elki.database.datastore; along with this program. If not, see . */ -import de.lmu.ifi.dbs.elki.database.ids.DBID; import de.lmu.ifi.dbs.elki.database.ids.DBIDRange; +import de.lmu.ifi.dbs.elki.database.ids.DBIDRef; /** * Mapping a static DBID range to storage IDs. @@ -47,7 +47,7 @@ public class RangeIDMap implements DataStoreIDMap { } @Override - public int map(DBID dbid) { + public int map(DBIDRef dbid) { return range.getOffset(dbid); } } diff --git a/src/de/lmu/ifi/dbs/elki/database/datastore/WritableDataStore.java b/src/de/lmu/ifi/dbs/elki/database/datastore/WritableDataStore.java index 7f1fbf52..93176445 100644 --- a/src/de/lmu/ifi/dbs/elki/database/datastore/WritableDataStore.java +++ b/src/de/lmu/ifi/dbs/elki/database/datastore/WritableDataStore.java @@ -23,7 +23,7 @@ package de.lmu.ifi.dbs.elki.database.datastore; along with this program. If not, see . */ -import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDRef; /** * Writable data store. @@ -32,7 +32,7 @@ import de.lmu.ifi.dbs.elki.database.ids.DBID; * * @apiviz.landmark * - * @param + * @param Data type */ public interface WritableDataStore extends DataStore { /** @@ -44,7 +44,7 @@ public interface WritableDataStore extends DataStore { * @param value Value to store. * @return previous value */ - public T put(DBID id, T value); + public T put(DBIDRef id, T value); /** * Deallocate the storage, freeing the memory and notifies the registered @@ -58,5 +58,5 @@ public interface WritableDataStore extends DataStore { * * @param id Database ID. 
*/ - public void delete(DBID id); + public void delete(DBIDRef id); } diff --git a/src/de/lmu/ifi/dbs/elki/database/datastore/WritableDoubleDataStore.java b/src/de/lmu/ifi/dbs/elki/database/datastore/WritableDoubleDataStore.java index 313e4adc..19cc54c7 100644 --- a/src/de/lmu/ifi/dbs/elki/database/datastore/WritableDoubleDataStore.java +++ b/src/de/lmu/ifi/dbs/elki/database/datastore/WritableDoubleDataStore.java @@ -1,7 +1,5 @@ package de.lmu.ifi.dbs.elki.database.datastore; -import de.lmu.ifi.dbs.elki.database.ids.DBID; - /* This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures @@ -25,15 +23,22 @@ import de.lmu.ifi.dbs.elki.database.ids.DBID; along with this program. If not, see . */ +import de.lmu.ifi.dbs.elki.database.ids.DBIDRef; + /** * Data store specialized for doubles. Avoids boxing/unboxing. * * @author Erich Schubert */ public interface WritableDoubleDataStore extends DoubleDataStore, WritableDataStore { + /** + * Setter, but using objects. + * + * @deprecated Use {@link #putDouble} instead, to avoid boxing/unboxing cost. + */ @Override @Deprecated - public Double put(DBID id, Double value); + public Double put(DBIDRef id, Double value); /** * Associates the specified value with the specified id in this storage. If @@ -44,7 +49,7 @@ public interface WritableDoubleDataStore extends DoubleDataStore, WritableDataSt * @param value Value to store. * @return previous value */ - public double putDouble(DBID id, double value); + public double putDouble(DBIDRef id, double value); /** * Associates the specified value with the specified id in this storage. If @@ -55,5 +60,5 @@ public interface WritableDoubleDataStore extends DoubleDataStore, WritableDataSt * @param value Value to store. 
* @return previous value */ - public double put(DBID id, double value); + public double put(DBIDRef id, double value); } \ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/database/datastore/WritableIntegerDataStore.java b/src/de/lmu/ifi/dbs/elki/database/datastore/WritableIntegerDataStore.java new file mode 100644 index 00000000..b8bf1348 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/database/datastore/WritableIntegerDataStore.java @@ -0,0 +1,64 @@ +package de.lmu.ifi.dbs.elki.database.datastore; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2012 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see . + */ + +import de.lmu.ifi.dbs.elki.database.ids.DBIDRef; + +/** + * Data store specialized for doubles. Avoids boxing/unboxing. + * + * @author Erich Schubert + */ +public interface WritableIntegerDataStore extends IntegerDataStore, WritableDataStore { + /** + * Setter, but using objects. + * + * @deprecated Use {@link #putInt} instead, to avoid boxing/unboxing cost. + */ + @Override + @Deprecated + public Integer put(DBIDRef id, Integer value); + + /** + * Associates the specified value with the specified id in this storage. 
If + * the storage previously contained a value for the id, the previous value is + * replaced by the specified value. + * + * @param id Database ID. + * @param value Value to store. + * @return previous value + */ + public int putInt(DBIDRef id, int value); + + /** + * Associates the specified value with the specified id in this storage. If + * the storage previously contained a value for the id, the previous value is + * replaced by the specified value. + * + * @param id Database ID. + * @param value Value to store. + * @return previous value + */ + public int put(DBIDRef id, int value); +} \ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/database/datastore/WritableRecordStore.java b/src/de/lmu/ifi/dbs/elki/database/datastore/WritableRecordStore.java index 775d201b..0799fdfe 100644 --- a/src/de/lmu/ifi/dbs/elki/database/datastore/WritableRecordStore.java +++ b/src/de/lmu/ifi/dbs/elki/database/datastore/WritableRecordStore.java @@ -23,7 +23,7 @@ package de.lmu.ifi.dbs.elki.database.datastore; along with this program. If not, see . */ -import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDRef; /** * Represents a storage which stores multiple values per object in a record fashion. @@ -52,5 +52,5 @@ public interface WritableRecordStore extends RecordStore { * @param id object ID to remove * @return success code */ - public boolean remove(DBID id); + public boolean remove(DBIDRef id); } diff --git a/src/de/lmu/ifi/dbs/elki/database/datastore/memory/ArrayDoubleStore.java b/src/de/lmu/ifi/dbs/elki/database/datastore/memory/ArrayDoubleStore.java index 433547a5..de22a6b3 100644 --- a/src/de/lmu/ifi/dbs/elki/database/datastore/memory/ArrayDoubleStore.java +++ b/src/de/lmu/ifi/dbs/elki/database/datastore/memory/ArrayDoubleStore.java @@ -23,9 +23,11 @@ package de.lmu.ifi.dbs.elki.database.datastore.memory; along with this program. If not, see . 
*/ +import java.util.Arrays; + import de.lmu.ifi.dbs.elki.database.datastore.DataStoreIDMap; import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore; -import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDRef; /** * A class to answer representation queries using the stored Array. @@ -52,14 +54,28 @@ public class ArrayDoubleStore implements WritableDoubleDataStore { * @param idmap ID map */ public ArrayDoubleStore(int size, DataStoreIDMap idmap) { + this(size, idmap, Double.NaN); + } + + /** + * Constructor. + * + * @param size Size + * @param idmap ID map + * @param def Default value + */ + public ArrayDoubleStore(int size, DataStoreIDMap idmap, double def) { super(); this.data = new double[size]; + if(def != 0) { + Arrays.fill(this.data, def); + } this.idmap = idmap; } @Override @Deprecated - public Double get(DBID id) { + public Double get(DBIDRef id) { try { return data[idmap.map(id)]; } @@ -70,20 +86,20 @@ public class ArrayDoubleStore implements WritableDoubleDataStore { @Override @Deprecated - public Double put(DBID id, Double value) { + public Double put(DBIDRef id, Double value) { final int off = idmap.map(id); double ret = data[off]; data[off] = value; return ret; } - + @Override - public double doubleValue(DBID id) { + public double doubleValue(DBIDRef id) { return data[idmap.map(id)]; } @Override - public double putDouble(DBID id, double value) { + public double putDouble(DBIDRef id, double value) { final int off = idmap.map(id); final double ret = data[off]; data[off] = value; @@ -91,7 +107,7 @@ public class ArrayDoubleStore implements WritableDoubleDataStore { } @Override - public double put(DBID id, double value) { + public double put(DBIDRef id, double value) { final int off = idmap.map(id); final double ret = data[off]; data[off] = value; @@ -105,7 +121,7 @@ public class ArrayDoubleStore implements WritableDoubleDataStore { } @Override - public void delete(DBID id) { + public void delete(DBIDRef id) 
{ throw new UnsupportedOperationException("Can't delete from a static array storage."); } diff --git a/src/de/lmu/ifi/dbs/elki/database/datastore/memory/ArrayIntegerStore.java b/src/de/lmu/ifi/dbs/elki/database/datastore/memory/ArrayIntegerStore.java new file mode 100644 index 00000000..8caa7ec3 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/database/datastore/memory/ArrayIntegerStore.java @@ -0,0 +1,137 @@ +package de.lmu.ifi.dbs.elki.database.datastore.memory; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2012 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see . + */ + +import java.util.Arrays; + +import de.lmu.ifi.dbs.elki.database.datastore.DataStoreIDMap; +import de.lmu.ifi.dbs.elki.database.datastore.WritableIntegerDataStore; +import de.lmu.ifi.dbs.elki.database.ids.DBIDRef; + +/** + * A class to answer representation queries using the stored Array. + * + * @author Erich Schubert + * + * @apiviz.composedOf de.lmu.ifi.dbs.elki.database.datastore.DataStoreIDMap + */ +public class ArrayIntegerStore implements WritableIntegerDataStore { + /** + * Data array + */ + private int[] data; + + /** + * DBID to index map + */ + private DataStoreIDMap idmap; + + /** + * Constructor. 
+ * + * @param size Size + * @param idmap ID map + */ + public ArrayIntegerStore(int size, DataStoreIDMap idmap) { + this(size, idmap, 0); + } + + /** + * Constructor. + * + * @param size Size + * @param idmap ID map + * @param def Default value + */ + public ArrayIntegerStore(int size, DataStoreIDMap idmap, int def) { + super(); + this.data = new int[size]; + if (def != 0) { + Arrays.fill(this.data, def); + } + this.idmap = idmap; + } + + @Override + @Deprecated + public Integer get(DBIDRef id) { + try { + return data[idmap.map(id)]; + } + catch(ArrayIndexOutOfBoundsException e) { + return null; + } + } + + @Override + @Deprecated + public Integer put(DBIDRef id, Integer value) { + final int off = idmap.map(id); + int ret = data[off]; + data[off] = value; + return ret; + } + + @Override + public int intValue(DBIDRef id) { + return data[idmap.map(id)]; + } + + @Override + public int putInt(DBIDRef id, int value) { + final int off = idmap.map(id); + final int ret = data[off]; + data[off] = value; + return ret; + } + + @Override + public int put(DBIDRef id, int value) { + final int off = idmap.map(id); + final int ret = data[off]; + data[off] = value; + return ret; + } + + @Override + public void destroy() { + data = null; + idmap = null; + } + + @Override + public void delete(DBIDRef id) { + throw new UnsupportedOperationException("Can't delete from a static array storage."); + } + + @Override + public String getLongName() { + return "raw"; + } + + @Override + public String getShortName() { + return "raw"; + } +} \ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/database/datastore/memory/ArrayRecordStore.java b/src/de/lmu/ifi/dbs/elki/database/datastore/memory/ArrayRecordStore.java index 7be68c97..6e578b61 100644 --- a/src/de/lmu/ifi/dbs/elki/database/datastore/memory/ArrayRecordStore.java +++ b/src/de/lmu/ifi/dbs/elki/database/datastore/memory/ArrayRecordStore.java @@ -26,7 +26,7 @@ package de.lmu.ifi.dbs.elki.database.datastore.memory; import 
de.lmu.ifi.dbs.elki.database.datastore.DataStoreIDMap; import de.lmu.ifi.dbs.elki.database.datastore.WritableDataStore; import de.lmu.ifi.dbs.elki.database.datastore.WritableRecordStore; -import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDRef; /** * A class to answer representation queries using the stored Array. @@ -73,7 +73,7 @@ public class ArrayRecordStore implements WritableRecordStore { * @return current value */ @SuppressWarnings("unchecked") - protected T get(DBID id, int index) { + protected T get(DBIDRef id, int index) { try { return (T) data[idmap.map(id)][index]; } @@ -97,7 +97,7 @@ public class ArrayRecordStore implements WritableRecordStore { * @return old value */ @SuppressWarnings("unchecked") - protected T set(DBID id, int index, T value) { + protected T set(DBIDRef id, int index, T value) { T ret = (T) data[idmap.map(id)][index]; data[idmap.map(id)][index] = value; return ret; @@ -128,12 +128,12 @@ public class ArrayRecordStore implements WritableRecordStore { @SuppressWarnings("unchecked") @Override - public T get(DBID id) { + public T get(DBIDRef id) { return (T) ArrayRecordStore.this.get(id, index); } @Override - public T put(DBID id, T value) { + public T put(DBIDRef id, T value) { return ArrayRecordStore.this.set(id, index, value); } @@ -143,7 +143,7 @@ public class ArrayRecordStore implements WritableRecordStore { } @Override - public void delete(DBID id) { + public void delete(DBIDRef id) { throw new UnsupportedOperationException("ArrayStore record values cannot be deleted."); } @@ -159,7 +159,7 @@ public class ArrayRecordStore implements WritableRecordStore { } @Override - public boolean remove(DBID id) { + public boolean remove(DBIDRef id) { throw new UnsupportedOperationException("ArrayStore records cannot be removed."); } } \ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/database/datastore/memory/ArrayStore.java b/src/de/lmu/ifi/dbs/elki/database/datastore/memory/ArrayStore.java 
index a41a444d..a7ce310b 100644 --- a/src/de/lmu/ifi/dbs/elki/database/datastore/memory/ArrayStore.java +++ b/src/de/lmu/ifi/dbs/elki/database/datastore/memory/ArrayStore.java @@ -25,7 +25,7 @@ package de.lmu.ifi.dbs.elki.database.datastore.memory; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreIDMap; import de.lmu.ifi.dbs.elki.database.datastore.WritableDataStore; -import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDRef; /** * A class to answer representation queries using the stored Array. @@ -58,7 +58,7 @@ public class ArrayStore implements WritableDataStore { @SuppressWarnings("unchecked") @Override - public T get(DBID id) { + public T get(DBIDRef id) { try { return (T) data[idmap.map(id)]; } @@ -74,7 +74,7 @@ public class ArrayStore implements WritableDataStore { } @Override - public T put(DBID id, T value) { + public T put(DBIDRef id, T value) { T ret = get(id); data[idmap.map(id)] = value; return ret; @@ -87,7 +87,7 @@ public class ArrayStore implements WritableDataStore { } @Override - public void delete(DBID id) { + public void delete(DBIDRef id) { throw new UnsupportedOperationException("Can't delete from a static array storage."); } diff --git a/src/de/lmu/ifi/dbs/elki/database/datastore/memory/MapIntegerDBIDDoubleStore.java b/src/de/lmu/ifi/dbs/elki/database/datastore/memory/MapIntegerDBIDDoubleStore.java index ae06dc00..f9f8d48a 100644 --- a/src/de/lmu/ifi/dbs/elki/database/datastore/memory/MapIntegerDBIDDoubleStore.java +++ b/src/de/lmu/ifi/dbs/elki/database/datastore/memory/MapIntegerDBIDDoubleStore.java @@ -1,4 +1,5 @@ package de.lmu.ifi.dbs.elki.database.datastore.memory; + /* This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures @@ -25,7 +26,7 @@ package de.lmu.ifi.dbs.elki.database.datastore.memory; import gnu.trove.map.TIntDoubleMap; import gnu.trove.map.hash.TIntDoubleHashMap; import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore; 
-import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDRef; /** * Writable data store for double values. @@ -37,25 +38,35 @@ public class MapIntegerDBIDDoubleStore implements WritableDoubleDataStore { * Data storage */ private TIntDoubleMap map; - + /** * Constructor. - * + * * @param size Expected size */ public MapIntegerDBIDDoubleStore(int size) { + this(size, Double.NaN); + } + + /** + * Constructor. + * + * @param size Expected size + * @param def Default value + */ + public MapIntegerDBIDDoubleStore(int size, double def) { super(); - map = new TIntDoubleHashMap(size, 0.5f, Integer.MIN_VALUE, Double.NaN); + map = new TIntDoubleHashMap(size, 0.5f, Integer.MIN_VALUE, def); } @Override @Deprecated - public Double get(DBID id) { + public Double get(DBIDRef id) { return map.get(id.getIntegerID()); } @Override - public double doubleValue(DBID id) { + public double doubleValue(DBIDRef id) { return map.get(id.getIntegerID()); } @@ -71,7 +82,7 @@ public class MapIntegerDBIDDoubleStore implements WritableDoubleDataStore { @Override @Deprecated - public Double put(DBID id, Double value) { + public Double put(DBIDRef id, Double value) { return map.put(id.getIntegerID(), value); } @@ -82,17 +93,17 @@ public class MapIntegerDBIDDoubleStore implements WritableDoubleDataStore { } @Override - public void delete(DBID id) { + public void delete(DBIDRef id) { map.remove(id.getIntegerID()); } @Override - public double putDouble(DBID id, double value) { + public double putDouble(DBIDRef id, double value) { return map.put(id.getIntegerID(), value); } @Override - public double put(DBID id, double value) { + public double put(DBIDRef id, double value) { return map.put(id.getIntegerID(), value); } } diff --git a/src/de/lmu/ifi/dbs/elki/database/datastore/memory/MapIntegerDBIDIntegerStore.java b/src/de/lmu/ifi/dbs/elki/database/datastore/memory/MapIntegerDBIDIntegerStore.java new file mode 100644 index 00000000..f7aea633 --- /dev/null +++ 
b/src/de/lmu/ifi/dbs/elki/database/datastore/memory/MapIntegerDBIDIntegerStore.java @@ -0,0 +1,109 @@ +package de.lmu.ifi.dbs.elki.database.datastore.memory; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2012 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see . + */ + +import gnu.trove.map.TIntIntMap; +import gnu.trove.map.hash.TIntIntHashMap; +import de.lmu.ifi.dbs.elki.database.datastore.WritableIntegerDataStore; +import de.lmu.ifi.dbs.elki.database.ids.DBIDRef; + +/** + * Writable data store for double values. + * + * @author Erich Schubert + */ +public class MapIntegerDBIDIntegerStore implements WritableIntegerDataStore { + /** + * Data storage + */ + private TIntIntMap map; + + /** + * Constructor. + * + * @param size Expected size + */ + public MapIntegerDBIDIntegerStore(int size) { + this(size, 0); + } + + /** + * Constructor. 
+ * + * @param size Expected size + * @param def Default value + */ + public MapIntegerDBIDIntegerStore(int size, int def) { + super(); + map = new TIntIntHashMap(size, 0.5f, Integer.MIN_VALUE, def); + } + + @Override + @Deprecated + public Integer get(DBIDRef id) { + return map.get(id.getIntegerID()); + } + + @Override + public int intValue(DBIDRef id) { + return map.get(id.getIntegerID()); + } + + @Override + public String getLongName() { + return "raw"; + } + + @Override + public String getShortName() { + return "raw"; + } + + @Override + @Deprecated + public Integer put(DBIDRef id, Integer value) { + return map.put(id.getIntegerID(), value); + } + + @Override + public void destroy() { + map.clear(); + map = null; + } + + @Override + public void delete(DBIDRef id) { + map.remove(id.getIntegerID()); + } + + @Override + public int putInt(DBIDRef id, int value) { + return map.put(id.getIntegerID(), value); + } + + @Override + public int put(DBIDRef id, int value) { + return map.put(id.getIntegerID(), value); + } +} diff --git a/src/de/lmu/ifi/dbs/elki/database/datastore/memory/MapIntegerDBIDRecordStore.java b/src/de/lmu/ifi/dbs/elki/database/datastore/memory/MapIntegerDBIDRecordStore.java index 8272fb2e..805c6de3 100644 --- a/src/de/lmu/ifi/dbs/elki/database/datastore/memory/MapIntegerDBIDRecordStore.java +++ b/src/de/lmu/ifi/dbs/elki/database/datastore/memory/MapIntegerDBIDRecordStore.java @@ -27,7 +27,7 @@ import gnu.trove.map.TIntObjectMap; import gnu.trove.map.hash.TIntObjectHashMap; import de.lmu.ifi.dbs.elki.database.datastore.WritableDataStore; import de.lmu.ifi.dbs.elki.database.datastore.WritableRecordStore; -import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDRef; /** * A class to answer representation queries using a map and an index within the @@ -93,7 +93,7 @@ public class MapIntegerDBIDRecordStore implements WritableRecordStore { * @return current value */ @SuppressWarnings("unchecked") - protected T get(DBID id, 
int index) { + protected T get(DBIDRef id, int index) { Object[] d = data.get(id.getIntegerID()); if(d == null) { return null; @@ -118,7 +118,7 @@ public class MapIntegerDBIDRecordStore implements WritableRecordStore { * @return previous value */ @SuppressWarnings("unchecked") - protected T set(DBID id, int index, T value) { + protected T set(DBIDRef id, int index, T value) { Object[] d = data.get(id.getIntegerID()); if(d == null) { d = new Object[rlen]; @@ -154,12 +154,12 @@ public class MapIntegerDBIDRecordStore implements WritableRecordStore { @SuppressWarnings("unchecked") @Override - public T get(DBID id) { + public T get(DBIDRef id) { return (T) MapIntegerDBIDRecordStore.this.get(id, index); } @Override - public T put(DBID id, T value) { + public T put(DBIDRef id, T value) { return MapIntegerDBIDRecordStore.this.set(id, index, value); } @@ -169,7 +169,7 @@ public class MapIntegerDBIDRecordStore implements WritableRecordStore { } @Override - public void delete(DBID id) { + public void delete(DBIDRef id) { throw new UnsupportedOperationException("Record storage values cannot be deleted."); } @@ -185,7 +185,7 @@ public class MapIntegerDBIDRecordStore implements WritableRecordStore { } @Override - public boolean remove(DBID id) { + public boolean remove(DBIDRef id) { return data.remove(id.getIntegerID()) != null; } } \ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/database/datastore/memory/MapIntegerDBIDStore.java b/src/de/lmu/ifi/dbs/elki/database/datastore/memory/MapIntegerDBIDStore.java index 4deb929d..e04027d0 100644 --- a/src/de/lmu/ifi/dbs/elki/database/datastore/memory/MapIntegerDBIDStore.java +++ b/src/de/lmu/ifi/dbs/elki/database/datastore/memory/MapIntegerDBIDStore.java @@ -26,7 +26,7 @@ package de.lmu.ifi.dbs.elki.database.datastore.memory; import gnu.trove.map.TIntObjectMap; import gnu.trove.map.hash.TIntObjectHashMap; import de.lmu.ifi.dbs.elki.database.datastore.WritableDataStore; -import de.lmu.ifi.dbs.elki.database.ids.DBID; 
+import de.lmu.ifi.dbs.elki.database.ids.DBIDRef; /** * A class to answer representation queries using a map. Basically, it is just a @@ -70,12 +70,12 @@ public class MapIntegerDBIDStore implements WritableDataStore { } @Override - public T get(DBID id) { + public T get(DBIDRef id) { return data.get(id.getIntegerID()); } @Override - public T put(DBID id, T value) { + public T put(DBIDRef id, T value) { if(value == null) { return data.remove(id.getIntegerID()); } @@ -88,7 +88,7 @@ public class MapIntegerDBIDStore implements WritableDataStore { } @Override - public void delete(DBID id) { + public void delete(DBIDRef id) { data.remove(id.getIntegerID()); } diff --git a/src/de/lmu/ifi/dbs/elki/database/datastore/memory/MapRecordStore.java b/src/de/lmu/ifi/dbs/elki/database/datastore/memory/MapRecordStore.java index 5a98966f..05cf3697 100644 --- a/src/de/lmu/ifi/dbs/elki/database/datastore/memory/MapRecordStore.java +++ b/src/de/lmu/ifi/dbs/elki/database/datastore/memory/MapRecordStore.java @@ -29,6 +29,7 @@ import java.util.concurrent.ConcurrentHashMap; import de.lmu.ifi.dbs.elki.database.datastore.WritableDataStore; import de.lmu.ifi.dbs.elki.database.datastore.WritableRecordStore; import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDRef; /** * A class to answer representation queries using a map and an index within the @@ -47,6 +48,7 @@ public class MapRecordStore implements WritableRecordStore { /** * Storage Map */ + // TODO: Use trove maps? 
private final Map data; /** @@ -84,8 +86,8 @@ public class MapRecordStore implements WritableRecordStore { * @return current value */ @SuppressWarnings("unchecked") - protected T get(DBID id, int index) { - Object[] d = data.get(id); + protected T get(DBIDRef id, int index) { + Object[] d = data.get(id.getDBID()); if(d == null) { return null; } @@ -109,11 +111,11 @@ public class MapRecordStore implements WritableRecordStore { * @return previous value */ @SuppressWarnings("unchecked") - protected T set(DBID id, int index, T value) { - Object[] d = data.get(id); + protected T set(DBIDRef id, int index, T value) { + Object[] d = data.get(id.getDBID()); if(d == null) { d = new Object[rlen]; - data.put(id, d); + data.put(id.getDBID(), d); } T ret = (T) d[index]; d[index] = value; @@ -145,12 +147,12 @@ public class MapRecordStore implements WritableRecordStore { @SuppressWarnings("unchecked") @Override - public T get(DBID id) { + public T get(DBIDRef id) { return (T) MapRecordStore.this.get(id, index); } @Override - public T put(DBID id, T value) { + public T put(DBIDRef id, T value) { return MapRecordStore.this.set(id, index, value); } @@ -160,7 +162,7 @@ public class MapRecordStore implements WritableRecordStore { } @Override - public void delete(DBID id) { + public void delete(DBIDRef id) { throw new UnsupportedOperationException("Record storage values cannot be deleted."); } @@ -176,7 +178,7 @@ public class MapRecordStore implements WritableRecordStore { } @Override - public boolean remove(DBID id) { + public boolean remove(DBIDRef id) { return data.remove(id) != null; } diff --git a/src/de/lmu/ifi/dbs/elki/database/datastore/memory/MapStore.java b/src/de/lmu/ifi/dbs/elki/database/datastore/memory/MapStore.java index 27cd9f63..90742993 100644 --- a/src/de/lmu/ifi/dbs/elki/database/datastore/memory/MapStore.java +++ b/src/de/lmu/ifi/dbs/elki/database/datastore/memory/MapStore.java @@ -28,6 +28,7 @@ import java.util.Map; import 
de.lmu.ifi.dbs.elki.database.datastore.WritableDataStore; import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDRef; /** * A class to answer representation queries using a map. Basically, it is just a @@ -41,6 +42,7 @@ public class MapStore implements WritableDataStore { /** * Storage Map */ + // TODO: use trove maps? private Map data; /** @@ -62,16 +64,16 @@ public class MapStore implements WritableDataStore { } @Override - public T get(DBID id) { - return data.get(id); + public T get(DBIDRef id) { + return data.get(id.getDBID()); } @Override - public T put(DBID id, T value) { + public T put(DBIDRef id, T value) { if(value == null) { - return data.remove(id); + return data.remove(id.getDBID()); } - return data.put(id, value); + return data.put(id.getDBID(), value); } @Override @@ -80,7 +82,7 @@ public class MapStore implements WritableDataStore { } @Override - public void delete(DBID id) { + public void delete(DBIDRef id) { data.remove(id); } diff --git a/src/de/lmu/ifi/dbs/elki/database/datastore/memory/MemoryDataStoreFactory.java b/src/de/lmu/ifi/dbs/elki/database/datastore/memory/MemoryDataStoreFactory.java index 3e3ce017..683e4c5d 100644 --- a/src/de/lmu/ifi/dbs/elki/database/datastore/memory/MemoryDataStoreFactory.java +++ b/src/de/lmu/ifi/dbs/elki/database/datastore/memory/MemoryDataStoreFactory.java @@ -27,6 +27,7 @@ import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; import de.lmu.ifi.dbs.elki.database.datastore.RangeIDMap; import de.lmu.ifi.dbs.elki.database.datastore.WritableDataStore; import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore; +import de.lmu.ifi.dbs.elki.database.datastore.WritableIntegerDataStore; import de.lmu.ifi.dbs.elki.database.datastore.WritableRecordStore; import de.lmu.ifi.dbs.elki.database.ids.DBIDRange; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; @@ -47,8 +48,15 @@ import de.lmu.ifi.dbs.elki.database.ids.DBIDs; * @apiviz.uses MapRecordStore oneway - - «create» */ 
public class MemoryDataStoreFactory implements DataStoreFactory { + @SuppressWarnings("unchecked") @Override public WritableDataStore makeStorage(DBIDs ids, int hints, Class dataclass) { + if (Double.class.equals(dataclass)) { + return (WritableDataStore) makeDoubleStorage(ids, hints); + } + if (Integer.class.equals(dataclass)) { + return (WritableDataStore) makeIntegerStorage(ids, hints); + } if(ids instanceof DBIDRange) { DBIDRange range = (DBIDRange) ids; Object[] data = new Object[range.size()]; @@ -70,6 +78,39 @@ public class MemoryDataStoreFactory implements DataStoreFactory { } } + @Override + public WritableDoubleDataStore makeDoubleStorage(DBIDs ids, int hints, double def) { + if(ids instanceof DBIDRange) { + DBIDRange range = (DBIDRange) ids; + return new ArrayDoubleStore(range.size(), new RangeIDMap(range), def); + } + else { + return new MapIntegerDBIDDoubleStore(ids.size(), def); + } + } + + @Override + public WritableIntegerDataStore makeIntegerStorage(DBIDs ids, int hints) { + if(ids instanceof DBIDRange) { + DBIDRange range = (DBIDRange) ids; + return new ArrayIntegerStore(range.size(), new RangeIDMap(range)); + } + else { + return new MapIntegerDBIDIntegerStore(ids.size()); + } + } + + @Override + public WritableIntegerDataStore makeIntegerStorage(DBIDs ids, int hints, int def) { + if(ids instanceof DBIDRange) { + DBIDRange range = (DBIDRange) ids; + return new ArrayIntegerStore(range.size(), new RangeIDMap(range), def); + } + else { + return new MapIntegerDBIDIntegerStore(ids.size(), def); + } + } + @Override public WritableRecordStore makeRecordStorage(DBIDs ids, int hints, Class... 
dataclasses) { if(ids instanceof DBIDRange) { diff --git a/src/de/lmu/ifi/dbs/elki/database/ids/ArrayDBIDs.java b/src/de/lmu/ifi/dbs/elki/database/ids/ArrayDBIDs.java index 13d6ea58..68bbb83d 100644 --- a/src/de/lmu/ifi/dbs/elki/database/ids/ArrayDBIDs.java +++ b/src/de/lmu/ifi/dbs/elki/database/ids/ArrayDBIDs.java @@ -42,6 +42,7 @@ public interface ArrayDBIDs extends DBIDs { * * @return Iterator */ + @Override public DBIDIter iter(); /** @@ -49,6 +50,7 @@ public interface ArrayDBIDs extends DBIDs { * * @return size */ + @Override public int size(); /** @@ -61,5 +63,5 @@ public interface ArrayDBIDs extends DBIDs { * @param key Key to search for * @return Offset of key */ - public int binarySearch(DBID key); + public int binarySearch(DBIDRef key); } diff --git a/src/de/lmu/ifi/dbs/elki/database/ids/ArrayModifiableDBIDs.java b/src/de/lmu/ifi/dbs/elki/database/ids/ArrayModifiableDBIDs.java index e9a6c8e0..95bcc2f7 100644 --- a/src/de/lmu/ifi/dbs/elki/database/ids/ArrayModifiableDBIDs.java +++ b/src/de/lmu/ifi/dbs/elki/database/ids/ArrayModifiableDBIDs.java @@ -59,4 +59,12 @@ public interface ArrayModifiableDBIDs extends ModifiableDBIDs, ArrayDBIDs { * @return previous value */ public DBID set(int i, DBID newval); + + /** + * Swap DBIDs at positions a and b. + * + * @param a First position + * @param b Second position + */ + public void swap(int a, int b); } \ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/database/ids/DBID.java b/src/de/lmu/ifi/dbs/elki/database/ids/DBID.java index 1391b75b..8d98893d 100644 --- a/src/de/lmu/ifi/dbs/elki/database/ids/DBID.java +++ b/src/de/lmu/ifi/dbs/elki/database/ids/DBID.java @@ -37,11 +37,67 @@ package de.lmu.ifi.dbs.elki.database.ids; * * @apiviz.landmark */ -public interface DBID extends Comparable, ArrayDBIDs { +public interface DBID extends DBIDRef, Comparable, ArrayDBIDs { /** - * Return the integer value of the object ID, if possible. + * Compare the current value of two referenced DBIDs.
* - * @return integer id + * @param other Other DBID reference (or DBID) + * @return {@code true} when the references currently refer to the same. */ - public int getIntegerID(); + @Override + public boolean sameDBID(DBIDRef other); + + /** + * Compare two objects by the value of the referenced DBID. + * + * @param other Other DBID or object + * @return -1, 0 or +1 + */ + @Override + public int compareDBID(DBIDRef other); + + /** + * In contrast to {@link DBIDRef}, the DBID interface is supposed to have a + * stable hash code. However, it is generally preferred to use optimized + * storage classes instead of Java collections! + * + * @return hash code + */ + @Override + public int hashCode(); + + /** + * In contrast to {@link DBIDRef}, the DBID interface is supposed to have a + * stable equals for other DBIDs. + * + * Yet, {@link #sameDBID} is more type safe and explicit. + * + * @return true when the object is the same DBID. + */ + @Override + public boolean equals(Object obj); + + /** + * Part of the DBIDRef API, this must return {@code this} for an + * actual DBID. + * + * @return {@code this} + * @deprecated When the object is known to be a DBID, the usage of this method + * is pointless, therefore it is marked as deprecated to cause a + * warning. + */ + @Deprecated + @Override + public DBID getDBID(); + + /** + * Compare two DBIDs for ordering. + * + * Consider using {@link #compareDBID}, which is more explicit. 
+ * + * @param other Other DBID object + * @return Comparison result + */ + @Override + public int compareTo(DBIDRef other); } \ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/database/ids/DBIDFactory.java b/src/de/lmu/ifi/dbs/elki/database/ids/DBIDFactory.java index f35f390b..6063508b 100644 --- a/src/de/lmu/ifi/dbs/elki/database/ids/DBIDFactory.java +++ b/src/de/lmu/ifi/dbs/elki/database/ids/DBIDFactory.java @@ -40,7 +40,7 @@ import de.lmu.ifi.dbs.elki.persistent.FixedSizeByteBufferSerializer; * @apiviz.uses DBIDRange oneway - - «create» * @apiviz.uses ArrayModifiableDBIDs oneway - - «create» * @apiviz.uses HashSetModifiableDBIDs oneway - - «create» - * @apiviz.uses TreeSetModifiableDBIDs oneway - - «create» + * @apiviz.uses HashSetModifiableDBIDs oneway - - «create» * @apiviz.has ByteBufferSerializer oneway - - provides */ public interface DBIDFactory { @@ -89,12 +89,12 @@ public interface DBIDFactory { /** * Make a DBID pair from two existing DBIDs. * - * @param first first DBID - * @param second second DBID + * @param id1 first DBID + * @param id2 second DBID * * @return new pair. */ - public DBIDPair makePair(DBID first, DBID second); + public DBIDPair makePair(DBIDRef id1, DBIDRef id2); /** * Make a new (modifiable) array of DBIDs. diff --git a/src/de/lmu/ifi/dbs/elki/database/ids/DBIDIter.java b/src/de/lmu/ifi/dbs/elki/database/ids/DBIDIter.java index a41284f4..f2d0ae91 100644 --- a/src/de/lmu/ifi/dbs/elki/database/ids/DBIDIter.java +++ b/src/de/lmu/ifi/dbs/elki/database/ids/DBIDIter.java @@ -22,6 +22,9 @@ package de.lmu.ifi.dbs.elki.database.ids; You should have received a copy of the GNU Affero General Public License along with this program. If not, see . */ + +import de.lmu.ifi.dbs.elki.utilities.iterator.Iter; + /** * Iterator for DBIDs. * @@ -30,11 +33,15 @@ package de.lmu.ifi.dbs.elki.database.ids; * with Java, but at the same time, the syntax is much more compatible with for * loops. * - * Usage example:
{@code + * Usage example: + * + *
+ * {@code 
  * for (DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) {
  *   iter.getDBID();
  * }
- * }
+ * } + * * * We list some fundamental differences. *