package de.lmu.ifi.dbs.elki.algorithm.outlier.clustering; /* This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures Copyright (C) 2014 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. You should have received a copy of the GNU Affero General Public License along with this program. If not, see . */ import java.util.List; import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm; import de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.KMeans; import de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.KMeansLloyd; import de.lmu.ifi.dbs.elki.algorithm.outlier.OutlierAlgorithm; import de.lmu.ifi.dbs.elki.data.Cluster; import de.lmu.ifi.dbs.elki.data.Clustering; import de.lmu.ifi.dbs.elki.data.NumberVector; import de.lmu.ifi.dbs.elki.data.model.ModelUtil; import de.lmu.ifi.dbs.elki.data.type.TypeInformation; import de.lmu.ifi.dbs.elki.data.type.TypeUtil; import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore; import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; import de.lmu.ifi.dbs.elki.database.relation.DoubleRelation; import de.lmu.ifi.dbs.elki.database.relation.MaterializedDoubleRelation; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.database.relation.RelationUtil; import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction; import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.math.DoubleMinMax; import de.lmu.ifi.dbs.elki.result.outlier.BasicOutlierScoreMeta; import de.lmu.ifi.dbs.elki.result.outlier.OutlierResult; import de.lmu.ifi.dbs.elki.result.outlier.OutlierScoreMeta; import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; /** * Outlier detection by using k-means clustering. * * The scores are assigned by the objects distance to the nearest center. * * We don't have a clear reference for this approach, but it seems to be a best * practise in some areas to remove objects that have the largest distance from * their center. If you need to cite this approach, please cite the ELKI version * you used (use the ELKI * publication list for citation information and BibTeX templates). * * @author Erich Schubert * * @apiviz.has KMeans * * @param Object type */ public class KMeansOutlierDetection extends AbstractAlgorithm implements OutlierAlgorithm { /** * Class logger. */ private static final Logging LOG = Logging.getLogger(KMeansOutlierDetection.class); /** * Clustering algorithm to use */ KMeans clusterer; /** * Constructor. * * @param clusterer Clustering algorithm */ public KMeansOutlierDetection(KMeans clusterer) { super(); this.clusterer = clusterer; } /** * Run the outlier detection algorithm. * * @param database Database * @param relation Relation * @return Outlier detection result */ public OutlierResult run(Database database, Relation relation) { DistanceFunction df = clusterer.getDistanceFunction(); DistanceQuery dq = database.getDistanceQuery(relation, df); // TODO: improve ELKI api to ensure we're using the same DBIDs! Clustering c = clusterer.run(database, relation); WritableDoubleDataStore scores = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_DB); DoubleMinMax mm = new DoubleMinMax(); @SuppressWarnings("unchecked") NumberVector.Factory factory = (NumberVector.Factory) RelationUtil.assumeVectorField(relation).getFactory(); List> clusters = c.getAllClusters(); for(Cluster cluster : clusters) { // FIXME: use a primitive distance function on number vectors instead. O mean = factory.newNumberVector(ModelUtil.getPrototype(cluster.getModel(), relation)); for(DBIDIter iter = cluster.getIDs().iter(); iter.valid(); iter.advance()) { double dist = dq.distance(mean, iter); scores.put(iter, dist); mm.put(dist); } } // Build result representation. DoubleRelation scoreResult = new MaterializedDoubleRelation("KMeans outlier scores", "kmeans-outlier", scores, relation.getDBIDs()); OutlierScoreMeta scoreMeta = new BasicOutlierScoreMeta(mm.getMin(), mm.getMax(), 0., Double.POSITIVE_INFINITY, 0.); return new OutlierResult(scoreMeta, scoreResult); } @Override public TypeInformation[] getInputTypeRestriction() { return TypeUtil.array(clusterer.getDistanceFunction().getInputTypeRestriction()); } @Override protected Logging getLogger() { return LOG; } /** * Parameterizer. * * @author Erich Schubert * * @apiviz.exclude * * @param Object type */ public static class Parameterizer extends AbstractParameterizer { /** * Parameter for choosing the clustering algorithm. */ public static final OptionID CLUSTERING_ID = new OptionID("kmeans.algorithm", // "Clustering algorithm to use for detecting outliers."); /** * Clustering algorithm to use */ KMeans clusterer; @Override protected void makeOptions(Parameterization config) { super.makeOptions(config); ObjectParameter> clusterP = new ObjectParameter<>(CLUSTERING_ID, KMeans.class, KMeansLloyd.class); if(config.grab(clusterP)) { clusterer = clusterP.instantiateClass(config); } } @Override protected KMeansOutlierDetection makeInstance() { return new KMeansOutlierDetection<>(clusterer); } } }