package de.lmu.ifi.dbs.elki.algorithm.clustering.uncertain;
/*
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
Copyright (C) 2015
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see .
*/
import java.util.Random;
import de.lmu.ifi.dbs.elki.algorithm.clustering.DBSCAN;
import de.lmu.ifi.dbs.elki.algorithm.clustering.gdbscan.GeneralizedDBSCAN;
import de.lmu.ifi.dbs.elki.algorithm.clustering.gdbscan.NeighborPredicate;
import de.lmu.ifi.dbs.elki.data.NumberVector;
import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.data.uncertain.DiscreteUncertainObject;
import de.lmu.ifi.dbs.elki.data.uncertain.UncertainObject;
import de.lmu.ifi.dbs.elki.database.Database;
import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
import de.lmu.ifi.dbs.elki.database.ids.DBIDRef;
import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.distance.distancefunction.minkowski.SquaredEuclideanDistanceFunction;
import de.lmu.ifi.dbs.elki.math.random.RandomFactory;
import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.RandomParameter;
/**
* Density-based Clustering of Applications with Noise and Fuzzy objects
* (FDBSCAN) is an Algorithm to find sets in a fuzzy database that are
* density-connected with minimum probability.
*
* Reference:
*
* Hans-Peter Kriegel and Martin Pfeifle:
* Density-based clustering of uncertain data
* In Proc. 11th ACM Int. Conf. on Knowledge Discovery and Data Mining (SIGKDD),
* Chicago, IL, 2005.
*
*
* This class is a NeighborPredicate presenting this Algorithm in use with
* {@link GeneralizedDBSCAN}
.
*
* Only Euclidean distance is supported, because of the pruning strategy
* described in the original article which needs minimum and maximum distances
* of bounding rectangles. Index support is not yet available.
*
* @author Alexander Koos
* @author Erich Schubert
*
* @apiviz.composedOf Instance
*/
@Title("FDBSCAN: Density-based Clustering of Applications with Noise on fuzzy objects")
@Description("Algorithm to find density-connected sets in a database consisting of uncertain/fuzzy objects based on the" //
+ " parameters 'minpts', 'epsilon', 'samplesize', and (if used) 'threshold'")
@Reference(authors = "Hans-Peter Kriegel and Martin Pfeifle", //
title = "Density-based clustering of uncertain data", //
booktitle = "Proc. 11th ACM Int. Conf. on Knowledge Discovery and Data Mining (KDD'05)", //
url = "http://dx.doi.org/10.1145/1081870.1081955")
public class FDBSCANNeighborPredicate implements NeighborPredicate {
/**
* Epsilon radius
*/
protected double epsilon;
/**
* The size of samplesets that should be drawn for neighborcheck.
*/
protected int sampleSize;
/**
* The relative amount of epsilon-close pairings determined by the
* neighborcheck.
*/
protected double threshold;
/**
* The Random
object to draw the samples with.
*/
protected RandomFactory rand;
/**
* Constructor.
*
* @param epsilon Maximum distance
* @param sampleSize Sampling size
* @param threshold Threshold on how many samples are within the radius
* @param seed Random generator for sampling
*/
public FDBSCANNeighborPredicate(double epsilon, int sampleSize, double threshold, RandomFactory seed) {
super();
this.epsilon = epsilon;
this.sampleSize = sampleSize;
this.threshold = threshold;
this.rand = seed;
}
@SuppressWarnings("unchecked")
@Override
public NeighborPredicate.Instance instantiate(Database database, SimpleTypeInformation> type) {
Relation extends UncertainObject> relation = database.getRelation(TypeUtil.UNCERTAIN_OBJECT_FIELD);
return (NeighborPredicate.Instance) new Instance(epsilon, sampleSize, threshold, relation, rand);
}
@Override
public TypeInformation getInputTypeRestriction() {
return TypeUtil.UNCERTAIN_OBJECT_FIELD;
}
@Override
public SimpleTypeInformation>[] getOutputType() {
return new SimpleTypeInformation>[] { TypeUtil.DBIDS };
}
/**
* Instance of the neighbor predicate.
*
* @author Alexander Koos
* @author Erich Schubert
*/
public static class Instance implements NeighborPredicate.Instance {
/**
* The epsilon distance a neighbor may have at most.
*/
private double epsilon, epsilonsq;
/**
* The size of samplesets that should be drawn for neighborcheck.
*/
private int sampleSize;
/**
* The relative amount of epsilon-close pairings determined by the
* neighborcheck.
*/
private double threshold;
/**
* The relation holding the uncertain objects.
*/
private Relation extends UncertainObject> relation;
/**
* The random generator to draw the samples with.
*/
private Random rand;
/**
*
* Constructor.
*
* @param epsilon Maximum distance
* @param sampleSize Sampling size
* @param threshold Threshold on how many samples are within the radius
* @param relation Data relation
* @param rand Random generator for sampling
*/
public Instance(double epsilon, int sampleSize, double threshold, Relation extends UncertainObject> relation, RandomFactory rand) {
super();
this.epsilon = epsilon;
this.epsilonsq = epsilon * epsilon;
this.sampleSize = sampleSize;
this.threshold = threshold;
this.relation = relation;
this.rand = rand.getRandom();
}
@Override
public DBIDs getNeighbors(DBIDRef reference) {
UncertainObject referenceObject = relation.get(reference);
ModifiableDBIDs resultList = DBIDUtil.newArray();
candidates: for(DBIDIter iter = relation.iterDBIDs(); iter.valid(); iter.advance()) {
if(DBIDUtil.equal(reference, iter)) {
resultList.add(iter); // Always include the query point.
continue;
}
UncertainObject comparisonObject = relation.get(iter);
double mindistsq = 0., maxdistsq = 0.;
for(int d = 0; d < referenceObject.getDimensionality(); d++) {
final double rmin = referenceObject.getMin(d);
final double rmax = referenceObject.getMax(d);
final double cmin = comparisonObject.getMin(d);
final double cmax = comparisonObject.getMax(d);
// Minimum distance in this dimension:
double mindelta = (cmin > rmax) ? cmin - rmax : (rmin > cmax) ? rmin - cmax : 0.;
// True negative in this single dimension:
if(mindelta > epsilon) {
continue candidates;
}
mindistsq += mindelta * mindelta;
// True negative by aggregated distances:
if(mindistsq > epsilonsq) {
continue candidates;
}
// Maximum distance in this dimension:
double m1 = Math.abs(rmin - cmax), m2 = Math.abs(cmin - rmax);
double maxdelta = m1 > m2 ? m1 : m2;
maxdistsq += maxdelta * maxdelta;
}
// True positive:
if(maxdistsq <= epsilonsq) {
resultList.add(iter);
continue;
}
// Perform the expensive checks:
if(checkSamples(referenceObject, comparisonObject)) {
resultList.add(iter);
continue;
}
}
return resultList;
}
private boolean checkSamples(UncertainObject o1, UncertainObject o2) {
final SquaredEuclideanDistanceFunction distance = SquaredEuclideanDistanceFunction.STATIC;
// Optimization for discrete objects:
if(o1 instanceof DiscreteUncertainObject && o2 instanceof DiscreteUncertainObject) {
DiscreteUncertainObject d1 = (DiscreteUncertainObject) o1;
DiscreteUncertainObject d2 = (DiscreteUncertainObject) o2;
final int l1 = d1.getNumberSamples(), l2 = d2.getNumberSamples();
final double limit = threshold * l1 * l2;
int count = 0;
for(int i = 0; i < l1; i++) {
NumberVector s1 = d1.getSample(i);
for(int j = 0; j < l2; j++) {
NumberVector s2 = d2.getSample(j);
if(distance.distance(s2, s1) <= epsilonsq) {
// Keep track of how many are epsilon-close
count++;
// Stop early:
if(count >= limit) {
return true;
}
}
}
}
return false;
}
final double limit = threshold * sampleSize * sampleSize;
int count = 0;
for(int j = 0; j < sampleSize; j++) {
NumberVector s1 = o1.drawSample(rand);
for(int i = 0; i < sampleSize; i++) {
NumberVector s2 = o2.drawSample(rand);
if(distance.distance(s2, s1) <= epsilonsq) {
// Keep track of how many are epsilon-close
count++;
// Stop early:
if(count >= limit) {
return true;
}
}
}
}
return false;
}
@Override
public DBIDs getIDs() {
return relation.getDBIDs();
}
@Override
public DBIDIter iterDBIDs(DBIDs neighbors) {
return neighbors.iter();
}
}
/**
* Parameterizer class.
*
* @author Alexander Koos
* @author Erich Schubert
*
* @apiviz.exclude
*/
public static class Parameterizer extends AbstractParameterizer {
/**
* Number of samples per uncertain object.
*/
public final static OptionID SAMPLE_SIZE_ID = new OptionID("fdbscan.samplesize", //
"The number of samples to draw from each uncertain object to determine the epsilon-neighborhood.");
/**
* Threshold for epsilon-neighborhood, defaults to 0.5.
*/
public final static OptionID THRESHOLD_ID = new OptionID("fdbscan.threshold", //
"The amount of samples that have to be epsilon-close for two objects to be neighbors.");
/**
* Seed for random sample draw.
*/
public final static OptionID SEED_ID = new OptionID("fdbscan.seed", //
"Random generator used to draw samples.");
/**
* Epsilon radius
*/
protected double epsilon;
/**
* The size of samplesets that should be drawn for neighborcheck.
*/
protected int sampleSize;
/**
* The relative amount of epsilon-close pairings determined by the
* neighborcheck.
*/
protected double threshold;
/**
* Random generator.
*/
protected RandomFactory seed;
@Override
public void makeOptions(Parameterization config) {
super.makeOptions(config);
DoubleParameter epsilonP = new DoubleParameter(DBSCAN.Parameterizer.EPSILON_ID) //
.addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_DOUBLE);
if(config.grab(epsilonP)) {
epsilon = epsilonP.doubleValue();
}
IntParameter sampleSizep = new IntParameter(SAMPLE_SIZE_ID) //
.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
if(config.grab(sampleSizep)) {
sampleSize = sampleSizep.intValue();
}
DoubleParameter thresholdp = new DoubleParameter(THRESHOLD_ID, 0.5) //
.addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE) //
.addConstraint(CommonConstraints.LESS_EQUAL_ONE_DOUBLE);
if(config.grab(thresholdp)) {
threshold = thresholdp.doubleValue();
}
RandomParameter seedp = new RandomParameter(SEED_ID);
if(config.grab(seedp)) {
seed = seedp.getValue();
}
}
@Override
protected FDBSCANNeighborPredicate makeInstance() {
return new FDBSCANNeighborPredicate(epsilon, sampleSize, threshold, seed);
}
}
}