package de.lmu.ifi.dbs.elki.utilities.referencepoints; /* This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures Copyright (C) 2013 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. You should have received a copy of the GNU Affero General Public License along with this program. If not, see . */ import java.util.ArrayList; import java.util.Collection; import java.util.HashSet; import de.lmu.ifi.dbs.elki.data.NumberVector; import de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs; import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.logging.LoggingUtil; import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; /** * Random-Sampling strategy for picking reference points. * * @author Erich Schubert * * @param Vector type */ public class RandomSampleReferencePoints> implements ReferencePointsHeuristic { // TODO: use reproducible Random /** * Parameter to specify the sample size. *

* Key: {@code -sample.n} *

*/ public static final OptionID N_ID = new OptionID("sample.n", "The number of samples to draw."); /** * Constant used in choosing optimal table sizes. */ private static final double LOG4 = Math.log(4); /** * Holds the value of {@link #N_ID}. */ private int samplesize; /** * Constructor. * * @param samplesize Sampling size */ public RandomSampleReferencePoints(int samplesize) { super(); this.samplesize = samplesize; } @Override public Collection getReferencePoints(Relation db) { if(samplesize >= db.size()) { LoggingUtil.warning("Sample size is larger than database size!"); ArrayList selection = new ArrayList<>(db.size()); for(DBIDIter iditer = db.iterDBIDs(); iditer.valid(); iditer.advance()) { selection.add(db.get(iditer)); } return selection; } ArrayList result = new ArrayList<>(samplesize); int dbsize = db.size(); // Guess the memory requirements of a hashmap. // The values are based on Python code, and might need modification for // Java! // If the hashmap is likely to become too big, lazy-shuffle a list instead. int setsize = 21; if(samplesize > 5) { setsize += 2 << (int) Math.ceil(Math.log(samplesize * 3) / LOG4); } // logger.debug("Setsize: "+setsize); ArrayDBIDs ids = DBIDUtil.ensureArray(db.getDBIDs()); boolean fastrandomaccess = false; if(ArrayList.class.isAssignableFrom(ids.getClass())) { fastrandomaccess = true; } if(samplesize <= setsize || !fastrandomaccess) { // use pool approach // if getIDs() is an array list, we don't need to copy it again. ArrayModifiableDBIDs pool = ((ArrayModifiableDBIDs.class.isAssignableFrom(ids.getClass())) ? (ArrayModifiableDBIDs) ids : DBIDUtil.newArray(ids)); for(int i = 0; i < samplesize; i++) { int j = (int) Math.floor(Math.random() * (dbsize - i)); result.add(db.get(pool.get(j))); pool.set(j, pool.get(dbsize - i - 1)); } ids = null; // dirty! } else { HashSet selected = new HashSet<>(); for(int i = 0; i < samplesize; i++) { int j = (int) Math.floor(Math.random() * dbsize); // Redraw from pool. while(selected.contains(j)) { j = (int) Math.floor(Math.random() * dbsize); } selected.add(j); result.add(db.get(ids.get(j))); } } assert (result.size() == samplesize); return result; } /** * Parameterization class. * * @author Erich Schubert * * @apiviz.exclude */ public static class Parameterizer> extends AbstractParameterizer { /** * Holds the value of {@link #N_ID}. */ protected int samplesize; @Override protected void makeOptions(Parameterization config) { super.makeOptions(config); IntParameter samplesizeP = new IntParameter(N_ID); samplesizeP.addConstraint(new GreaterConstraint(0)); if(config.grab(samplesizeP)) { samplesize = samplesizeP.intValue(); } } @Override protected RandomSampleReferencePoints makeInstance() { return new RandomSampleReferencePoints<>(samplesize); } } }