package de.lmu.ifi.dbs.elki.algorithm.clustering.correlation;
/*
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
import java.util.ArrayList;
import java.util.List;
import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm;
import de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm;
import de.lmu.ifi.dbs.elki.algorithm.clustering.DBSCAN;
import de.lmu.ifi.dbs.elki.algorithm.clustering.gdbscan.CorePredicate;
import de.lmu.ifi.dbs.elki.algorithm.clustering.gdbscan.ERiCNeighborPredicate;
import de.lmu.ifi.dbs.elki.algorithm.clustering.gdbscan.GeneralizedDBSCAN;
import de.lmu.ifi.dbs.elki.algorithm.clustering.gdbscan.MinPtsCorePredicate;
import de.lmu.ifi.dbs.elki.data.Cluster;
import de.lmu.ifi.dbs.elki.data.Clustering;
import de.lmu.ifi.dbs.elki.data.NumberVector;
import de.lmu.ifi.dbs.elki.data.model.CorrelationModel;
import de.lmu.ifi.dbs.elki.data.model.Model;
import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.database.Database;
import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.database.relation.RelationUtil;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.logging.progress.StepProgress;
import de.lmu.ifi.dbs.elki.math.linearalgebra.Centroid;
import de.lmu.ifi.dbs.elki.math.linearalgebra.pca.FirstNEigenPairFilter;
import de.lmu.ifi.dbs.elki.math.linearalgebra.pca.PCAFilteredResult;
import de.lmu.ifi.dbs.elki.math.linearalgebra.pca.PCAFilteredRunner;
import de.lmu.ifi.dbs.elki.math.linearalgebra.pca.StandardCovarianceMatrixBuilder;
import de.lmu.ifi.dbs.elki.utilities.datastructures.hierarchy.Hierarchy;
import de.lmu.ifi.dbs.elki.utilities.datastructures.hierarchy.Hierarchy.Iter;
import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
/**
* Performs correlation clustering on the data partitioned according to local
* correlation dimensionality and builds a hierarchy of correlation clusters
* that allows multiple inheritance from the clustering result.
*
* Reference:
*
* E. Achtert, C. Böhm, H.-P. Kriegel, P. Kröger, and A. Zimek:
* On Exploring Complex Relationships of Correlation Clusters.
* In Proc. 19th International Conference on Scientific and Statistical Database
* Management (SSDBM 2007), Banff, Canada, 2007.
*
*
* @author Elke Achtert
*
* @apiviz.composedOf COPAC
* @apiviz.composedOf DBSCAN
* @apiviz.composedOf FirstNEigenPairFilter
* @apiviz.composedOf PCAFilteredRunner
* @apiviz.composedOf ERiCNeighborPredicate
* @apiviz.has CorrelationModel
*
* @param <V> the type of NumberVector handled by this Algorithm
*/
@Title("ERiC: Exploring Relationships among Correlation Clusters")
@Description("Performs the DBSCAN algorithm on the data using a special distance function taking into account correlations among attributes and builds " //
+ "a hierarchy that allows multiple inheritance from the correlation clustering result.")
@Reference(authors = "E. Achtert, C. Böhm, H.-P. Kriegel, P. Kröger, and A. Zimek",//
title = "On Exploring Complex Relationships of Correlation Clusters", //
booktitle = "Proc. 19th International Conference on Scientific and Statistical Database Management (SSDBM 2007), Banff, Canada, 2007", //
url = "http://dx.doi.org/10.1109/SSDBM.2007.21")
public class ERiC extends AbstractAlgorithm>> implements ClusteringAlgorithm>> {
/**
* The logger for this class.
*/
private static final Logging LOG = Logging.getLogger(ERiC.class);
/**
* ERiC Settings.
*/
private ERiC.Settings settings;
/**
 * Create an ERiC instance from a pre-built settings object.
 *
 * @param settings ERiC clustering settings
 */
public ERiC(ERiC.Settings settings) {
  this.settings = settings;
}
/**
 * Performs the ERiC algorithm on the given database.
 *
 * Three steps: (1) run Generalized DBSCAN with the ERiC neighborhood
 * predicate to partition the data by local correlation dimensionality,
 * (2) extract correlation clusters, grouped per correlation dimension,
 * (3) build the parent-child hierarchy across dimensionality levels.
 *
 * @param database Database (used to instantiate the DBSCAN predicates)
 * @param relation Relation to process
 * @return Clustering result; the clusters of the highest remaining
 *         correlation dimensionality (including merged noise) become the
 *         top level clusters
 */
public Clustering> run(Database database, Relation relation) {
final int dimensionality = RelationUtil.dimensionality(relation);
StepProgress stepprog = LOG.isVerbose() ? new StepProgress(3) : null;
// Step 1: Generalized DBSCAN partitions the data by local correlation
// dimensionality, using the ERiC neighbor predicate.
LOG.beginStep(stepprog, 1, "Preprocessing local correlation dimensionalities and partitioning data");
// FIXME: how to ensure we are running on the same relation?
ERiCNeighborPredicate.Instance npred = new ERiCNeighborPredicate(settings).instantiate(database, relation);
CorePredicate.Instance cpred = new MinPtsCorePredicate(settings.minpts).instantiate(database, TypeUtil.DBIDS);
Clustering copacResult = new GeneralizedDBSCAN.Instance<>(npred, cpred, false).run();
// Step 2: extract correlation clusters, indexed by correlation dimension.
LOG.beginStep(stepprog, 2, "Extract correlation clusters");
List>>> clusterMap = extractCorrelationClusters(copacResult, relation, dimensionality, npred);
if(LOG.isDebugging()) {
StringBuilder msg = new StringBuilder("Step 2: Extract correlation clusters...");
for(int corrDim = 0; corrDim < clusterMap.size(); corrDim++) {
List>> correlationClusters = clusterMap.get(corrDim);
msg.append("\n\ncorrDim ").append(corrDim);
for(Cluster> cluster : correlationClusters) {
msg.append("\n cluster ").append(cluster).append(", ids: ").append(cluster.getIDs().size());
// .append(", level: ").append(cluster.getLevel()).append(", index: ").append(cluster.getLevelIndex());
// msg.append("\n basis " +
// cluster.getPCA().getWeakEigenvectors().toString(" ", NF) +
// " ids " + cluster.getIDs().size());
}
}
LOG.debugFine(msg.toString());
}
if(LOG.isVerbose()) {
// Report the total number of clusters over all dimensionalities.
int clusters = 0;
for(List>> correlationClusters : clusterMap) {
clusters += correlationClusters.size();
}
LOG.verbose(clusters + " clusters extracted.");
}
// Step 3: build the hierarchy between dimensionality levels.
LOG.beginStep(stepprog, 3, "Building hierarchy");
Clustering> clustering = new Clustering<>("ERiC clustering", "eric-clustering");
buildHierarchy(clustering, clusterMap, npred);
if(LOG.isDebugging()) {
StringBuilder msg = new StringBuilder("Step 3: Build hierarchy");
for(int corrDim = 0; corrDim < clusterMap.size(); corrDim++) {
List>> correlationClusters = clusterMap.get(corrDim);
for(Cluster> cluster : correlationClusters) {
msg.append("\n cluster ").append(cluster).append(", ids: ").append(cluster.getIDs().size());
// .append(", level: ").append(cluster.getLevel()).append(", index: ").append(cluster.getLevelIndex());
for(Iter>> iter = clustering.getClusterHierarchy().iterParents(cluster); iter.valid(); iter.advance()) {
msg.append("\n parent ").append(iter.get());
}
for(Iter>> iter = clustering.getClusterHierarchy().iterChildren(cluster); iter.valid(); iter.advance()) {
msg.append("\n child ").append(iter.get());
}
}
}
LOG.debugFine(msg.toString());
}
LOG.setCompleted(stepprog);
// The clusters of maximal correlation dimensionality form the top level.
for(Cluster> rc : clusterMap.get(clusterMap.size() - 1)) {
clustering.addToplevelCluster(rc);
}
return clustering;
}
/**
 * Extracts the correlation clusters and noise from the copac result and
 * returns a mapping of correlation dimension to maps of clusters within this
 * correlation dimension. Each cluster is defined by the basis vectors
 * defining the subspace in which the cluster appears.
 *
 * All noise partitions (and any cluster whose correlation dimensionality
 * equals the full data dimensionality) are merged into a single noise
 * cluster, stored at index {@code dimensionality}. Trailing empty
 * dimensionality levels are removed from the result, so the last list is
 * always non-empty.
 *
 * @param dbscanResult Generalized DBSCAN clustering result to convert
 * @param database the relation containing the objects
 * @param dimensionality the dimensionality of the feature space
 * @param npred ERiC predicate; provides the precomputed per-point
 *        correlation dimensionality
 * @return a list of clusters for each dimensionality
 */
private List>>> extractCorrelationClusters(Clustering dbscanResult, Relation database, int dimensionality, ERiCNeighborPredicate.Instance npred) {
// result: one (initially empty) cluster list per correlation dimension 0..dimensionality
List>>> clusterMap = new ArrayList<>();
for(int i = 0; i <= dimensionality; i++) {
clusterMap.add(new ArrayList>>());
}
// noise cluster containing all noise objects over all partitions
Cluster noise = null;
// iterate over correlation dimensions
for(Cluster clus : dbscanResult.getAllClusters()) {
DBIDs group = clus.getIDs();
// Noise clusters are treated as having full dimensionality.
int dim = clus.isNoise() ? dimensionality : npred.dimensionality(clus.getIDs().iter());
if(dim < dimensionality) {
// Run a PCA restricted to the first 'dim' eigenpairs to obtain the
// subspace basis and centroid stored in the cluster model.
PCAFilteredRunner pca = new PCAFilteredRunner(new StandardCovarianceMatrixBuilder(), new FirstNEigenPairFilter(dim), 1., 0.);
// get cluster list for this dimension.
List>> correlationClusters = clusterMap.get(dim);
PCAFilteredResult pcares = pca.processIds(group, database);
V centroid = Centroid.make(database, group).toVector(database);
Cluster> correlationCluster = new Cluster<>("[" + dim + "_" + correlationClusters.size() + "]", group, new CorrelationModel<>(pcares, centroid));
correlationClusters.add(correlationCluster);
}
// partition containing noise
else {
if(noise == null) {
noise = clus;
}
else {
// Merge all noise partitions into one cluster.
ModifiableDBIDs merged = DBIDUtil.newHashSet(noise.getIDs());
merged.addDBIDs(clus.getIDs());
noise.setIDs(merged);
}
}
}
if(noise != null && noise.size() > 0) {
// get cluster list for this dimension.
List>> correlationClusters = clusterMap.get(dimensionality);
// Full-dimensional PCA for the merged noise cluster's model.
PCAFilteredRunner pca = new PCAFilteredRunner(new StandardCovarianceMatrixBuilder(), new FirstNEigenPairFilter(dimensionality), 1., 0.);
PCAFilteredResult pcares = pca.processIds(noise.getIDs(), database);
V centroid = Centroid.make(database, noise.getIDs()).toVector(database);
Cluster> correlationCluster = new Cluster<>("[noise]", noise.getIDs(), new CorrelationModel<>(pcares, centroid));
correlationClusters.add(correlationCluster);
}
// Delete dimensionalities not found: trim trailing empty levels so that
// the last entry of the result is always a non-empty list (used as the
// top level by the caller).
for(int i = dimensionality; i > 0; i--) {
if(clusterMap.get(i).size() > 0) {
break;
}
clusterMap.remove(i);
}
return clusterMap;
}
/**
 * Builds the cluster hierarchy: for each cluster, candidate parents are all
 * clusters of strictly higher correlation dimensionality. A parent link is
 * added either when the parent is of maximal dimensionality and the child is
 * still orphaned, or when the subspace-distance test succeeds and no already
 * assigned parent covers the child (see {@link #isParent}).
 *
 * @param clustering clustering whose hierarchy is extended in place
 * @param clusterMap clusters grouped by correlation dimensionality
 * @param npred neighborhood predicate used for the subspace distance tests
 */
private void buildHierarchy(Clustering> clustering, List>>> clusterMap, ERiCNeighborPredicate.Instance npred) {
StringBuilder msg = LOG.isDebuggingFine() ? new StringBuilder() : null;
Hierarchy>> hier = clustering.getClusterHierarchy();
// Find maximum dimensionality found:
int lambda_max = clusterMap.size() - 1;
for(int childCorrDim = 0; childCorrDim < lambda_max; childCorrDim++) {
List>> children = clusterMap.get(childCorrDim);
if(msg != null) {
msg.append("\ncorrdim ").append(childCorrDim);
}
for(Cluster> child : children) {
// Candidate parents have strictly higher correlation dimensionality.
for(int parentCorrDim = childCorrDim + 1; parentCorrDim <= lambda_max; parentCorrDim++) {
List>> parents = clusterMap.get(parentCorrDim);
for(Cluster> parent : parents) {
int subspaceDim_parent = parent.getModel().getPCAResult().getCorrelationDimension();
if(subspaceDim_parent == lambda_max && hier.numParents(child) == 0) {
// Top-level parent: attach any still-orphaned child.
clustering.addChildCluster(parent, child);
if(msg != null) {
msg.append('\n').append(parent).append(" is parent of ").append(child);
}
}
else {
// NOTE(review): weakNeighbors appears to return true when the
// subspaces are NOT approximately linear dependent (i.e. "far");
// the link is added on !dist — confirm against ERiCNeighborPredicate.
boolean dist = npred.weakNeighbors(parent.getModel().getCentroid(), child.getModel().getCentroid(), parent.getModel().getPCAResult(), child.getModel().getPCAResult());
if(!dist && (hier.numParents(child) == 0 || !isParent(npred, parent, hier.iterParents(child)))) {
clustering.addChildCluster(parent, child);
if(msg != null) {
msg.append('\n').append(parent).append(" is parent of ").append(child);
}
}
}
}
}
}
}
if(msg != null) {
LOG.debugFine(msg.toString());
}
}
/**
 * Returns true, if the specified parent cluster is a parent of one child of
 * the children clusters.
 *
 * Iteration stops early (returning false) when a child of the same
 * correlation dimensionality as the parent is encountered, and returns true
 * on the first child for which the subspace distance test fires.
 *
 * @param npred Neighborhood predicate
 * @param parent the parent to be tested
 * @param iter the list of children to be tested
 * @return true, if the specified parent cluster is a parent of one child of
 *         the children clusters, false otherwise
 */
private boolean isParent(ERiCNeighborPredicate.Instance npred, Cluster> parent, Iter>> iter) {
StringBuilder msg = LOG.isDebugging() ? new StringBuilder() : null;
for(; iter.valid(); iter.advance()) {
Cluster> child = iter.get();
// Same correlation dimensionality: parent cannot dominate this child.
if(parent.getModel().getPCAResult().getCorrelationDimension() == child.getModel().getPCAResult().getCorrelationDimension()) {
return false;
}
boolean dist = npred.weakNeighbors(parent.getModel().getCentroid(), child.getModel().getCentroid(), parent.getModel().getPCAResult(), child.getModel().getPCAResult());
if(msg != null) {
msg.append("\ndist(").append(child).append(" - ").append(parent).append(") = ").append(dist);
}
if(dist) {
if(msg != null) {
LOG.debugFine(msg);
}
return true;
}
}
if(msg != null) {
LOG.debugFine(msg.toString());
}
return false;
}
@Override
public TypeInformation[] getInputTypeRestriction() {
  // ERiC requires a field of numerical vectors as input.
  final TypeInformation restriction = TypeUtil.NUMBER_VECTOR_FIELD;
  return TypeUtil.array(restriction);
}
@Override
protected Logging getLogger() {
// Class logger; used by the AbstractAlgorithm base class for output.
return LOG;
}
/**
 * Class to wrap the ERiC settings.
 *
 * Plain public fields, populated by the nested {@link Parameterizer} and
 * consumed by {@code ERiC} and its predicates.
 *
 * @author Erich Schubert
 *
 * @apiviz.exclude
 */
public static class Settings {
  /**
   * Neighborhood size.
   */
  public int k;

  /**
   * Class to compute PCA.
   */
  public PCAFilteredRunner pca;

  /**
   * Parameter to specify the threshold for approximate linear dependency: the
   * strong eigenvectors of q are approximately linear dependent from the
   * strong eigenvectors p if the following condition holds for all strong
   * eigenvectors q_i of q (lambda_q < lambda_p): q_i' * M^check_p * q_i <=
   * delta^2, must be a double equal to or greater than 0.
   */
  public double delta;

  /**
   * Parameter to specify the threshold for the maximum distance between two
   * approximately linear dependent subspaces of two objects p and q (lambda_q
   * < lambda_p) before considering them as parallel, must be a double equal
   * to or greater than 0.
   */
  public double tau;

  /**
   * Minimum neighborhood size (density).
   */
  public int minpts;

  /**
   * Parameterization class.
   *
   * @author Erich Schubert
   *
   * @apiviz.exclude
   */
  public static class Parameterizer extends AbstractParameterizer {
    /**
     * Size for the kNN neighborhood used in the PCA step of ERiC.
     */
    public static final OptionID K_ID = new OptionID("eric.k", "Number of neighbors to use for PCA.");

    /**
     * Parameter to specify the threshold for approximate linear dependency:
     * the strong eigenvectors of q are approximately linear dependent from
     * the strong eigenvectors p if the following condition holds for all
     * strong eigenvectors q_i of q (lambda_q < lambda_p): q_i' * M^check_p *
     * q_i <= delta^2, must be a double equal to or greater than 0.
     *
     * Default value: {@code 0.1}
     *
     * Key: {@code -ericdf.delta}
     */
    // Fixed typo in the user-facing description: "stroneg" -> "strong".
    public static final OptionID DELTA_ID = new OptionID("ericdf.delta", "Threshold for approximate linear dependency: " + "the strong eigenvectors of q are approximately linear dependent " + "from the strong eigenvectors p if the following condition " + "holds for all strong eigenvectors q_i of q (lambda_q < lambda_p): " + "q_i' * M^check_p * q_i <= delta^2.");

    /**
     * Parameter to specify the threshold for the maximum distance between two
     * approximately linear dependent subspaces of two objects p and q
     * (lambda_q < lambda_p) before considering them as parallel, must be a
     * double equal to or greater than 0.
     *
     * Default value: {@code 0.1}
     *
     * Key: {@code -ericdf.tau}
     */
    public static final OptionID TAU_ID = new OptionID("ericdf.tau", "Threshold for the maximum distance between two approximately linear " + "dependent subspaces of two objects p and q " + "(lambda_q < lambda_p) before considering them as parallel.");

    /**
     * Settings to build.
     */
    Settings settings;

    @Override
    public void makeOptions(Parameterization config) {
      settings = new Settings();
      configK(config);
      // TODO: allow using other PCA runners?
      settings.pca = config.tryInstantiate(PCAFilteredRunner.class);
      configDelta(config);
      configTau(config);
      configMinPts(config);
    }

    /**
     * Configure the kNN parameter.
     *
     * @param config Parameter source
     */
    protected void configK(Parameterization config) {
      IntParameter kP = new IntParameter(K_ID) //
      .addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
      if(config.grab(kP)) {
        settings.k = kP.intValue();
      }
    }

    /**
     * Configure the delta parameter.
     *
     * @param config Parameter source
     */
    protected void configDelta(Parameterization config) {
      final DoubleParameter deltaP = new DoubleParameter(DELTA_ID, 0.1);
      deltaP.addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_DOUBLE);
      if(config.grab(deltaP)) {
        settings.delta = deltaP.doubleValue();
      }
    }

    /**
     * Configure the tau parameter.
     *
     * @param config Parameter source
     */
    protected void configTau(Parameterization config) {
      final DoubleParameter tauP = new DoubleParameter(TAU_ID, 0.1);
      tauP.addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_DOUBLE);
      if(config.grab(tauP)) {
        settings.tau = tauP.doubleValue();
      }
    }

    /**
     * Configure the minPts aka "mu" parameter.
     *
     * @param config Parameter source
     */
    protected void configMinPts(Parameterization config) {
      IntParameter minptsP = new IntParameter(DBSCAN.Parameterizer.MINPTS_ID) //
      .addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
      if(config.grab(minptsP)) {
        settings.minpts = minptsP.intValue();
      }
    }

    @Override
    public Settings makeInstance() {
      return settings;
    }
  }
}
/**
 * Parameterization class for ERiC: all options are delegated to the
 * {@link ERiC.Settings} parameterizer.
 *
 * @author Erich Schubert
 *
 * @apiviz.exclude
 */
public static class Parameterizer extends AbstractParameterizer {
/**
 * The settings to use.
 */
protected ERiC.Settings settings;
@Override
protected void makeOptions(Parameterization config) {
super.makeOptions(config);
// All individual parameters (k, delta, tau, minpts, pca) live in Settings.
settings = config.tryInstantiate(ERiC.Settings.class);
}
@Override
protected ERiC makeInstance() {
return new ERiC<>(settings);
}
}
}