package de.lmu.ifi.dbs.elki.index.preprocessed.preference; /* This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. You should have received a copy of the GNU Affero General Public License along with this program. If not, see . */ import java.util.ArrayList; import java.util.Arrays; import java.util.BitSet; import java.util.HashMap; import java.util.List; import java.util.Map; import de.lmu.ifi.dbs.elki.algorithm.APRIORI; import de.lmu.ifi.dbs.elki.data.Bit; import de.lmu.ifi.dbs.elki.data.BitVector; import de.lmu.ifi.dbs.elki.data.NumberVector; import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation; import de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation; import de.lmu.ifi.dbs.elki.database.HashmapDatabase; import de.lmu.ifi.dbs.elki.database.UpdatableDatabase; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.query.distance.PrimitiveDistanceQuery; import de.lmu.ifi.dbs.elki.database.query.range.RangeQuery; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.database.relation.RelationUtil; import de.lmu.ifi.dbs.elki.datasource.bundle.SingleObjectBundle; import de.lmu.ifi.dbs.elki.distance.distancefunction.subspace.DimensionSelectingDistanceFunction; import de.lmu.ifi.dbs.elki.distance.distanceresultlist.DistanceDBIDResult; import de.lmu.ifi.dbs.elki.distance.distancevalue.DoubleDistance; import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress; import de.lmu.ifi.dbs.elki.result.AprioriResult; import de.lmu.ifi.dbs.elki.utilities.ClassGenericsUtil; import de.lmu.ifi.dbs.elki.utilities.FormatUtil; import de.lmu.ifi.dbs.elki.utilities.documentation.Description; import de.lmu.ifi.dbs.elki.utilities.exceptions.ExceptionMessages; import de.lmu.ifi.dbs.elki.utilities.exceptions.UnableToComplyException; import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; import de.lmu.ifi.dbs.elki.utilities.optionhandling.WrongParameterValueException; import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleListParameter; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.EnumParameter; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; /** * Preprocessor for DiSH preference vector assignment to objects of a certain * database. * * @author Elke Achtert * * @param Vector type */ @Description("Computes the preference vector of objects of a certain database according to the DiSH algorithm.") public class DiSHPreferenceVectorIndex> extends AbstractPreferenceVectorIndex implements PreferenceVectorIndex { /** * Logger to use. */ private static final Logging LOG = Logging.getLogger(DiSHPreferenceVectorIndex.class); /** * Available strategies for determination of the preference vector. * * @apiviz.exclude */ public enum Strategy { /** * Apriori strategy. */ APRIORI, /** * Max intersection strategy. */ MAX_INTERSECTION } /** * The epsilon value for each dimension. */ protected DoubleDistance[] epsilon; /** * Threshold for minimum number of points in the neighborhood. */ protected int minpts; /** * The strategy to determine the preference vector. */ protected Strategy strategy; /** * Constructor. * * @param relation Relation to use * @param epsilon Epsilon value * @param minpts MinPts value * @param strategy Strategy */ public DiSHPreferenceVectorIndex(Relation relation, DoubleDistance[] epsilon, int minpts, Strategy strategy) { super(relation); this.epsilon = epsilon; this.minpts = minpts; this.strategy = strategy; } @Override protected void preprocess() { if(relation == null || relation.size() == 0) { throw new IllegalArgumentException(ExceptionMessages.DATABASE_EMPTY); } storage = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP, BitSet.class); if(LOG.isDebugging()) { StringBuilder msg = new StringBuilder(); msg.append("\n eps ").append(Arrays.asList(epsilon)); msg.append("\n minpts ").append(minpts); msg.append("\n strategy ").append(strategy); LOG.debugFine(msg.toString()); } long start = System.currentTimeMillis(); FiniteProgress progress = LOG.isVerbose() ? new FiniteProgress("Preprocessing preference vector", relation.size(), LOG) : null; // only one epsilon value specified int dim = RelationUtil.dimensionality(relation); if(epsilon.length == 1 && dim != 1) { DoubleDistance eps = epsilon[0]; epsilon = new DoubleDistance[dim]; Arrays.fill(epsilon, eps); } // epsilons as string RangeQuery[] rangeQueries = initRangeQueries(relation, dim); for(DBIDIter it = relation.iterDBIDs(); it.valid(); it.advance()) { StringBuilder msg = new StringBuilder(); if(LOG.isDebugging()) { msg.append("\nid = ").append(DBIDUtil.toString(it)); // msg.append(" ").append(database.get(id)); // msg.append(" ").append(database.getObjectLabelQuery().get(id)); } // determine neighbors in each dimension ModifiableDBIDs[] allNeighbors = new ModifiableDBIDs[dim]; for(int d = 0; d < dim; d++) { DistanceDBIDResult qrList = rangeQueries[d].getRangeForDBID(it, epsilon[d]); allNeighbors[d] = DBIDUtil.newHashSet(qrList); } if(LOG.isDebugging()) { for(int d = 0; d < dim; d++) { msg.append("\n neighbors [").append(d).append(']'); msg.append(" (").append(allNeighbors[d].size()).append(") = "); msg.append(allNeighbors[d]); } } try { storage.put(it, determinePreferenceVector(relation, allNeighbors, msg)); } catch(UnableToComplyException e) { throw new IllegalStateException(e); } if(LOG.isDebugging()) { LOG.debugFine(msg.toString()); } if(progress != null) { progress.incrementProcessed(LOG); } } if(progress != null) { progress.ensureCompleted(LOG); } long end = System.currentTimeMillis(); // TODO: re-add timing code! if(LOG.isVerbose()) { long elapsedTime = end - start; LOG.verbose(this.getClass().getName() + " runtime: " + elapsedTime + " milliseconds."); } } /** * Determines the preference vector according to the specified neighbor ids. * * @param relation the database storing the objects * @param neighborIDs the list of ids of the neighbors in each dimension * @param msg a string buffer for debug messages * @return the preference vector * * @throws de.lmu.ifi.dbs.elki.utilities.exceptions.UnableToComplyException */ private BitSet determinePreferenceVector(Relation relation, ModifiableDBIDs[] neighborIDs, StringBuilder msg) throws UnableToComplyException { if(strategy.equals(Strategy.APRIORI)) { return determinePreferenceVectorByApriori(relation, neighborIDs, msg); } else if(strategy.equals(Strategy.MAX_INTERSECTION)) { return determinePreferenceVectorByMaxIntersection(neighborIDs, msg); } else { throw new IllegalStateException("Should never happen!"); } } /** * Determines the preference vector with the apriori strategy. * * @param relation the database storing the objects * @param neighborIDs the list of ids of the neighbors in each dimension * @param msg a string buffer for debug messages * @return the preference vector * * @throws de.lmu.ifi.dbs.elki.utilities.exceptions.UnableToComplyException * */ private BitSet determinePreferenceVectorByApriori(Relation relation, ModifiableDBIDs[] neighborIDs, StringBuilder msg) throws UnableToComplyException { int dimensionality = neighborIDs.length; // database for apriori UpdatableDatabase apriori_db = new HashmapDatabase(); SimpleTypeInformation bitmeta = new VectorFieldTypeInformation(BitVector.class, dimensionality); for(DBIDIter it = relation.iterDBIDs(); it.valid(); it.advance()) { Bit[] bits = new Bit[dimensionality]; boolean allFalse = true; for(int d = 0; d < dimensionality; d++) { if(neighborIDs[d].contains(it)) { bits[d] = new Bit(true); allFalse = false; } else { bits[d] = new Bit(false); } } if(!allFalse) { SingleObjectBundle oaa = new SingleObjectBundle(); oaa.append(bitmeta, new BitVector(bits)); apriori_db.insert(oaa); } } APRIORI apriori = new APRIORI(minpts); AprioriResult aprioriResult = apriori.run(apriori_db); // result of apriori List frequentItemsets = aprioriResult.getSolution(); Map supports = aprioriResult.getSupports(); if(LOG.isDebugging()) { msg.append("\n Frequent itemsets: ").append(frequentItemsets); msg.append("\n All supports: ").append(supports); } int maxSupport = 0; int maxCardinality = 0; BitSet preferenceVector = new BitSet(); for(BitSet bitSet : frequentItemsets) { int cardinality = bitSet.cardinality(); if((maxCardinality < cardinality) || (maxCardinality == cardinality && maxSupport == supports.get(bitSet))) { preferenceVector = bitSet; maxCardinality = cardinality; maxSupport = supports.get(bitSet); } } if(LOG.isDebugging()) { msg.append("\n preference "); msg.append(FormatUtil.format(dimensionality, preferenceVector)); msg.append('\n'); LOG.debugFine(msg.toString()); } return preferenceVector; } /** * Determines the preference vector with the max intersection strategy. * * @param neighborIDs the list of ids of the neighbors in each dimension * @param msg a string buffer for debug messages * @return the preference vector */ private BitSet determinePreferenceVectorByMaxIntersection(ModifiableDBIDs[] neighborIDs, StringBuilder msg) { int dimensionality = neighborIDs.length; BitSet preferenceVector = new BitSet(dimensionality); Map candidates = new HashMap(dimensionality); for(int i = 0; i < dimensionality; i++) { ModifiableDBIDs s_i = neighborIDs[i]; if(s_i.size() > minpts) { candidates.put(i, s_i); } } if(LOG.isDebugging()) { msg.append("\n candidates ").append(candidates.keySet()); } if(!candidates.isEmpty()) { int i = max(candidates); ModifiableDBIDs intersection = candidates.remove(i); preferenceVector.set(i); while(!candidates.isEmpty()) { ModifiableDBIDs newIntersection = DBIDUtil.newHashSet(); i = maxIntersection(candidates, intersection, newIntersection); ModifiableDBIDs s_i = candidates.remove(i); // TODO: aren't we re-computing the same intersection here? newIntersection = DBIDUtil.intersection(intersection, s_i); intersection = newIntersection; if(intersection.size() < minpts) { break; } else { preferenceVector.set(i); } } } if(LOG.isDebugging()) { msg.append("\n preference "); msg.append(FormatUtil.format(dimensionality, preferenceVector)); msg.append('\n'); LOG.debug(msg.toString()); } return preferenceVector; } /** * Returns the set with the maximum size contained in the specified map. * * @param candidates the map containing the sets * @return the set with the maximum size */ private int max(Map candidates) { DBIDs maxSet = null; Integer maxDim = null; for(Integer nextDim : candidates.keySet()) { DBIDs nextSet = candidates.get(nextDim); if(maxSet == null || maxSet.size() < nextSet.size()) { maxSet = nextSet; maxDim = nextDim; } } return maxDim; } /** * Returns the index of the set having the maximum intersection set with the * specified set contained in the specified map. * * @param candidates the map containing the sets * @param set the set to intersect with * @param result the set to put the result in * @return the set with the maximum size */ private int maxIntersection(Map candidates, DBIDs set, ModifiableDBIDs result) { Integer maxDim = null; for(Integer nextDim : candidates.keySet()) { DBIDs nextSet = candidates.get(nextDim); ModifiableDBIDs nextIntersection = DBIDUtil.intersection(set, nextSet); if(result.size() < nextIntersection.size()) { result = nextIntersection; maxDim = nextDim; } } return maxDim; } /** * Initializes the dimension selecting distancefunctions to determine the * preference vectors. * * @param relation the database storing the objects * @param dimensionality the dimensionality of the objects * @return the dimension selecting distancefunctions to determine the * preference vectors */ private RangeQuery[] initRangeQueries(Relation relation, int dimensionality) { Class> rqcls = ClassGenericsUtil.uglyCastIntoSubclass(RangeQuery.class); RangeQuery[] rangeQueries = ClassGenericsUtil.newArrayOfNull(dimensionality, rqcls); for(int d = 0; d < dimensionality; d++) { rangeQueries[d] = relation.getDatabase().getRangeQuery(new PrimitiveDistanceQuery(relation, new DimensionSelectingDistanceFunction(d))); } return rangeQueries; } @Override protected Logging getLogger() { return LOG; } @Override public String getLongName() { return "DiSH Preference Vectors"; } @Override public String getShortName() { return "dish-pref"; } /** * Factory class. * * @author Erich Schubert * * @apiviz.stereotype factory * @apiviz.uses DiSHPreferenceVectorIndex oneway - - «create» * * @param Vector type */ public static class Factory> extends AbstractPreferenceVectorIndex.Factory> { /** * The default value for epsilon. */ public static final DoubleDistance DEFAULT_EPSILON = new DoubleDistance(0.001); /** * A comma separated list of positive doubles specifying the maximum radius * of the neighborhood to be considered in each dimension for determination * of the preference vector (default is {@link #DEFAULT_EPSILON} in each * dimension). If only one value is specified, this value will be used for * each dimension. * *

* Key: {@code -dish.epsilon} *

*

* Default value: {@link #DEFAULT_EPSILON} *

*/ public static final OptionID EPSILON_ID = new OptionID("dish.epsilon", "A comma separated list of positive doubles specifying the " + "maximum radius of the neighborhood to be " + "considered in each dimension for determination of " + "the preference vector " + "(default is " + DEFAULT_EPSILON + " in each dimension). " + "If only one value is specified, this value " + "will be used for each dimension."); /** * Option name for {@link #MINPTS_ID}. */ public static final String MINPTS_P = "dish.minpts"; /** * Description for the determination of the preference vector. */ private static final String CONDITION = "The value of the preference vector in dimension d_i is set to 1 " + "if the epsilon neighborhood contains more than " + MINPTS_P + " points and the following condition holds: " + "for all dimensions d_j: " + "|neighbors(d_i) intersection neighbors(d_j)| >= " + MINPTS_P + "."; /** * Positive threshold for minimum numbers of points in the * epsilon-neighborhood of a point, must satisfy following * {@link #CONDITION}. * *

* Key: {@code -dish.minpts} *

*/ public static final OptionID MINPTS_ID = new OptionID(MINPTS_P, "Positive threshold for minumum numbers of points in the epsilon-" + "neighborhood of a point. " + CONDITION); /** * Default strategy. */ public static final Strategy DEFAULT_STRATEGY = Strategy.MAX_INTERSECTION; /** * The strategy for determination of the preference vector, available * strategies are: {@link Strategy#APRIORI } and * {@link Strategy#MAX_INTERSECTION}. * *

* Key: {@code -dish.strategy} *

*

* Default value: {@link #DEFAULT_STRATEGY} *

*/ public static final OptionID STRATEGY_ID = new OptionID("dish.strategy", "The strategy for determination of the preference vector, " + "available strategies are: [" + Strategy.APRIORI + "| " + Strategy.MAX_INTERSECTION + "]" + "(default is " + DEFAULT_STRATEGY + ")"); /** * The epsilon value for each dimension. */ protected DoubleDistance[] epsilon; /** * Threshold for minimum number of points in the neighborhood. */ protected int minpts; /** * The strategy to determine the preference vector. */ protected Strategy strategy; /** * Constructor. * * @param epsilon Epsilon * @param minpts Minpts * @param strategy Strategy */ public Factory(DoubleDistance[] epsilon, int minpts, Strategy strategy) { super(); this.epsilon = epsilon; this.minpts = minpts; this.strategy = strategy; } @Override public DiSHPreferenceVectorIndex instantiate(Relation relation) { return new DiSHPreferenceVectorIndex(relation, epsilon, minpts, strategy); } /** * Return the minpts value. * * @return minpts */ public int getMinpts() { return minpts; } /** * Parameterization class. * * @author Erich Schubert * * @apiviz.exclude */ public static class Parameterizer> extends AbstractParameterizer { /** * The epsilon value for each dimension. */ protected DoubleDistance[] epsilon; /** * Threshold for minimum number of points in the neighborhood. */ protected int minpts; /** * The strategy to determine the preference vector. */ protected Strategy strategy; @Override protected void makeOptions(Parameterization config) { super.makeOptions(config); final IntParameter minptsP = new IntParameter(MINPTS_ID); minptsP.addConstraint(new GreaterConstraint(0)); if(config.grab(minptsP)) { minpts = minptsP.getValue(); } // parameter epsilon // todo: constraint auf positive werte List defaultEps = new ArrayList(); defaultEps.add(DEFAULT_EPSILON.doubleValue()); final DoubleListParameter epsilonP = new DoubleListParameter(EPSILON_ID, true); epsilonP.setDefaultValue(defaultEps); if(config.grab(epsilonP)) { List eps_list = epsilonP.getValue(); epsilon = new DoubleDistance[eps_list.size()]; for(int d = 0; d < eps_list.size(); d++) { epsilon[d] = new DoubleDistance(eps_list.get(d)); if(epsilon[d].doubleValue() < 0) { config.reportError(new WrongParameterValueException(epsilonP, eps_list.toString())); } } } // parameter strategy final EnumParameter strategyP = new EnumParameter(STRATEGY_ID, Strategy.class, DEFAULT_STRATEGY); if(config.grab(strategyP)) { strategy = strategyP.getValue(); } } @Override protected Factory makeInstance() { return new Factory(epsilon, minpts, strategy); } } } }