package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.initialization; /* This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures Copyright (C) 2015 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. You should have received a copy of the GNU Affero General Public License along with this program. If not, see . */ import java.util.ArrayList; import java.util.List; import de.lmu.ifi.dbs.elki.data.NumberVector; import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore; import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDRef; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.DBIDVar; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.distance.distancefunction.NumberVectorDistanceFunction; import de.lmu.ifi.dbs.elki.math.random.RandomFactory; /** * K-Means initialization by repeatedly choosing the farthest point (by the * sum of distances to previous objects). * * Note: this is less random than other initializations, so running multiple * times will be more likely to return the same local minima. * * @author Erich Schubert * * @param Object type for kmedoids and kmedians */ public class FarthestSumPointsInitialMeans extends FarthestPointsInitialMeans { /** * Constructor. * * @param rnd Random generator. * @param dropfirst Flag to discard the first vector. */ public FarthestSumPointsInitialMeans(RandomFactory rnd, boolean dropfirst) { super(rnd, dropfirst); } @Override public List chooseInitialMeans(Database database, Relation relation, int k, NumberVectorDistanceFunction distanceFunction, NumberVector.Factory factory) { // Get a distance query DistanceQuery distQ = database.getDistanceQuery(relation, distanceFunction); DBIDs ids = relation.getDBIDs(); WritableDoubleDataStore store = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP, 0.); // Chose first mean List means = new ArrayList<>(k); DBIDRef first = DBIDUtil.randomSample(ids, rnd); T prevmean = relation.get(first); means.add(factory.newNumberVector(prevmean)); // Find farthest object each. DBIDVar best = DBIDUtil.newVar(first); for(int i = (dropfirst ? 0 : 1); i < k; i++) { double maxdist = Double.NEGATIVE_INFINITY; for(DBIDIter it = ids.iter(); it.valid(); it.advance()) { final double prev = store.doubleValue(it); if(prev != prev) { continue; // NaN: already chosen! } double dsum = prev + distQ.distance(prevmean, it); // Don't store distance to first mean, when it will be dropped below. if(i > 0) { store.putDouble(it, dsum); } if(dsum > maxdist) { maxdist = dsum; best.set(it); } } // Add new mean (and drop the initial mean when desired) if(i == 0) { means.clear(); // Remove temporary first element. } store.putDouble(best, Double.NaN); // So it won't be chosen twice. prevmean = relation.get(best); means.add(factory.newNumberVector(prevmean)); } // Explicitly destroy temporary data. store.destroy(); return means; } @Override public DBIDs chooseInitialMedoids(int k, DBIDs ids, DistanceQuery distQ) { @SuppressWarnings("unchecked") final Relation relation = (Relation) distQ.getRelation(); WritableDoubleDataStore store = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP, 0.); ArrayModifiableDBIDs means = DBIDUtil.newArray(k); DBIDRef first = DBIDUtil.randomSample(ids, rnd); means.add(first); DBIDVar prevmean = DBIDUtil.newVar(first); DBIDVar best = DBIDUtil.newVar(first); for(int i = (dropfirst ? 0 : 1); i < k; i++) { // Find farthest object: double maxdist = Double.NEGATIVE_INFINITY; for(DBIDIter it = relation.iterDBIDs(); it.valid(); it.advance()) { final double prev = store.doubleValue(it); if(prev != prev) { continue; // NaN: already chosen! } double dsum = prev + distQ.distance(prevmean, it); // Don't store distance to first mean, when it will be dropped below. if(i > 0) { store.putDouble(it, dsum); } if(dsum > maxdist) { maxdist = dsum; best.set(it); } } // Add new mean: if(i == 0) { means.clear(); // Remove temporary first element. } store.putDouble(best, Double.NaN); // So it won't be chosen twice. prevmean.set(best); means.add(best); } store.destroy(); return means; } /** * Parameterization class. * * @author Erich Schubert * * @apiviz.exclude */ public static class Parameterizer extends FarthestPointsInitialMeans.Parameterizer { /** * Flag for discarding the first object chosen. */ protected boolean keepfirst = false; @Override protected FarthestSumPointsInitialMeans makeInstance() { return new FarthestSumPointsInitialMeans<>(rnd, !keepfirst); } } }