package de.lmu.ifi.dbs.elki.algorithm; /* This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures Copyright (C) 2011 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. You should have received a copy of the GNU Affero General Public License along with this program. If not, see . */ import java.text.NumberFormat; import java.util.List; import java.util.Locale; import de.lmu.ifi.dbs.elki.data.NumberVector; import de.lmu.ifi.dbs.elki.data.model.CorrelationAnalysisSolution; import de.lmu.ifi.dbs.elki.data.type.TypeInformation; import de.lmu.ifi.dbs.elki.data.type.TypeUtil; import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.query.DistanceResultPair; import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance; import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.math.linearalgebra.LinearEquationSystem; import de.lmu.ifi.dbs.elki.math.linearalgebra.Matrix; import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; import de.lmu.ifi.dbs.elki.math.linearalgebra.pca.PCAFilteredResult; import de.lmu.ifi.dbs.elki.math.linearalgebra.pca.PCAFilteredRunner; import de.lmu.ifi.dbs.elki.utilities.ClassGenericsUtil; import de.lmu.ifi.dbs.elki.utilities.DatabaseUtil; import de.lmu.ifi.dbs.elki.utilities.FormatUtil; import de.lmu.ifi.dbs.elki.utilities.documentation.Description; import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import de.lmu.ifi.dbs.elki.utilities.documentation.Title; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint; import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.Flag; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; /** *

* Dependency derivator computes quantitatively linear dependencies among * attributes of a given dataset based on a linear correlation PCA. *

* * Reference:
* E. Achtert, C. Böhm, H.-P. Kriegel, P. Kröger, A. Zimek: Deriving * Quantitative Dependencies for Correlation Clusters.
* In Proc. 12th Int. Conf. on Knowledge Discovery and Data Mining (KDD '06), * Philadelphia, PA 2006.

* * @author Arthur Zimek * @param the type of FeatureVector handled by this Algorithm * @param the type of Distance used by this Algorithm */ @Title("Dependency Derivator: Deriving numerical inter-dependencies on data") @Description("Derives an equality-system describing dependencies between attributes in a correlation-cluster") @Reference(authors = "E. Achtert, C. Böhm, H.-P. Kriegel, P. Kröger, A. Zimek", title = "Deriving Quantitative Dependencies for Correlation Clusters", booktitle = "Proc. 12th Int. Conf. on Knowledge Discovery and Data Mining (KDD '06), Philadelphia, PA 2006.", url = "http://dx.doi.org/10.1145/1150402.1150408") public class DependencyDerivator, D extends Distance> extends AbstractPrimitiveDistanceBasedAlgorithm> { /** * The logger for this class. */ private static final Logging logger = Logging.getLogger(DependencyDerivator.class); /** * Flag to use random sample (use knn query around centroid, if flag is not * set). */ public static final OptionID DEPENDENCY_DERIVATOR_RANDOM_SAMPLE = OptionID.getOrCreateOptionID("derivator.randomSample", "Flag to use random sample (use knn query around centroid, if flag is not set)."); /** * Parameter to specify the threshold for output accuracy fraction digits, * must be an integer equal to or greater than 0. */ public static final OptionID OUTPUT_ACCURACY_ID = OptionID.getOrCreateOptionID("derivator.accuracy", "Threshold for output accuracy fraction digits."); /** * Optional parameter to specify the treshold for the size of the random * sample to use, must be an integer greater than 0. *

* Default value: the size of the complete dataset *

*/ public static final OptionID SAMPLE_SIZE_ID = OptionID.getOrCreateOptionID("derivator.sampleSize", "Threshold for the size of the random sample to use. " + "Default value is size of the complete dataset."); /** * Holds the value of {@link #SAMPLE_SIZE_ID}. */ private final int sampleSize; /** * Holds the object performing the pca. */ private final PCAFilteredRunner pca; /** * Number format for output of solution. */ public final NumberFormat NF; /** * Flag for random sampling vs. kNN */ private final boolean randomsample; /** * Constructor. * * @param distanceFunction distance function * @param nf Number format * @param pca PCA runner * @param sampleSize sample size * @param randomsample flag for random sampling */ public DependencyDerivator(PrimitiveDistanceFunction distanceFunction, NumberFormat nf, PCAFilteredRunner pca, int sampleSize, boolean randomsample) { super(distanceFunction); this.NF = nf; this.pca = pca; this.sampleSize = sampleSize; this.randomsample = randomsample; } /** * Computes quantitatively linear dependencies among the attributes of the * given database based on a linear correlation PCA. * * @param database the database to run this DependencyDerivator on * @param relation the relation to use * @return the CorrelationAnalysisSolution computed by this * DependencyDerivator */ public CorrelationAnalysisSolution run(Database database, Relation relation) throws IllegalStateException { if(logger.isVerbose()) { logger.verbose("retrieving database objects..."); } V centroidDV = DatabaseUtil.centroid(relation); DBIDs ids; if(this.sampleSize > 0) { if(randomsample) { ids = DBIDUtil.randomSample(relation.getDBIDs(), this.sampleSize, 1); } else { DistanceQuery distanceQuery = database.getDistanceQuery(relation, getDistanceFunction()); List> queryResults = database.getKNNQuery(distanceQuery, this.sampleSize).getKNNForObject(centroidDV, this.sampleSize); ModifiableDBIDs tids = DBIDUtil.newHashSet(this.sampleSize); for(DistanceResultPair qr : queryResults) { tids.add(qr.getDBID()); } // Cast to non-modifiable ids = tids; } } else { ids = relation.getDBIDs(); } return generateModel(relation, ids, centroidDV); } /** * Runs the pca on the given set of IDs. The centroid is computed from the * given ids. * * @param db the database * @param ids the set of ids * @return a matrix of equations describing the dependencies */ public CorrelationAnalysisSolution generateModel(Relation db, DBIDs ids) { V centroidDV = DatabaseUtil.centroid(db, ids); return generateModel(db, ids, centroidDV); } /** * Runs the pca on the given set of IDs and for the given centroid. * * @param db the database * @param ids the set of ids * @param centroidDV the centroid * @return a matrix of equations describing the dependencies */ public CorrelationAnalysisSolution generateModel(Relation db, DBIDs ids, V centroidDV) { CorrelationAnalysisSolution sol; if(logger.isDebuggingFine()) { logger.debugFine("PCA..."); } PCAFilteredResult pcares = pca.processIds(ids, db); // Matrix weakEigenvectors = // pca.getEigenvectors().times(pca.selectionMatrixOfWeakEigenvectors()); Matrix weakEigenvectors = pcares.getWeakEigenvectors(); // Matrix strongEigenvectors = // pca.getEigenvectors().times(pca.selectionMatrixOfStrongEigenvectors()); Matrix strongEigenvectors = pcares.getStrongEigenvectors(); Vector centroid = centroidDV.getColumnVector(); // TODO: what if we don't have any weak eigenvectors? if(weakEigenvectors.getColumnDimensionality() == 0) { sol = new CorrelationAnalysisSolution(null, db, strongEigenvectors, weakEigenvectors, pcares.similarityMatrix(), centroid); } else { Matrix transposedWeakEigenvectors = weakEigenvectors.transpose(); if(logger.isDebugging()) { StringBuilder log = new StringBuilder(); log.append("Strong Eigenvectors:\n"); log.append(FormatUtil.format(pcares.getEigenvectors().times(pcares.selectionMatrixOfStrongEigenvectors()), NF)).append('\n'); log.append("Transposed weak Eigenvectors:\n"); log.append(FormatUtil.format(transposedWeakEigenvectors, NF)).append('\n'); log.append("Eigenvalues:\n"); log.append(FormatUtil.format(pcares.getEigenvalues(), " , ", 2)); logger.debugFine(log.toString()); } Vector B = transposedWeakEigenvectors.times(centroid); if(logger.isDebugging()) { StringBuilder log = new StringBuilder(); log.append("Centroid:\n").append(centroid).append('\n'); log.append("tEV * Centroid\n"); log.append(B); logger.debugFine(log.toString()); } // +1 == + B.getColumnDimensionality() Matrix gaussJordan = new Matrix(transposedWeakEigenvectors.getRowDimensionality(), transposedWeakEigenvectors.getColumnDimensionality() + 1); gaussJordan.setMatrix(0, transposedWeakEigenvectors.getRowDimensionality() - 1, 0, transposedWeakEigenvectors.getColumnDimensionality() - 1, transposedWeakEigenvectors); gaussJordan.setColumnVector(transposedWeakEigenvectors.getColumnDimensionality(), B); if(logger.isDebuggingFiner()) { logger.debugFiner("Gauss-Jordan-Elimination of " + FormatUtil.format(gaussJordan, NF)); } double[][] a = new double[transposedWeakEigenvectors.getRowDimensionality()][transposedWeakEigenvectors.getColumnDimensionality()]; double[][] we = transposedWeakEigenvectors.getArrayRef(); double[] b = B.getArrayRef(); System.arraycopy(we, 0, a, 0, transposedWeakEigenvectors.getRowDimensionality()); LinearEquationSystem lq = new LinearEquationSystem(a, b); lq.solveByTotalPivotSearch(); sol = new CorrelationAnalysisSolution(lq, db, strongEigenvectors, pcares.getWeakEigenvectors(), pcares.similarityMatrix(), centroid); if(logger.isDebuggingFine()) { StringBuilder log = new StringBuilder(); log.append("Solution:\n"); log.append("Standard deviation ").append(sol.getStandardDeviation()); log.append(lq.equationsToString(NF.getMaximumFractionDigits())); logger.debugFine(log.toString()); } } return sol; } @Override public TypeInformation[] getInputTypeRestriction() { return TypeUtil.array(TypeUtil.NUMBER_VECTOR_FIELD); } @Override protected Logging getLogger() { return logger; } /** * Parameterization class. * * @author Erich Schubert * * @apiviz.exclude */ public static class Parameterizer, D extends Distance> extends AbstractPrimitiveDistanceBasedAlgorithm.Parameterizer { protected int outputAccuracy = 0; protected int sampleSize = 0; protected boolean randomSample = false; protected PCAFilteredRunner pca = null; @Override protected void makeOptions(Parameterization config) { super.makeOptions(config); configAccuracy(config); configSampleSize(config); configRandomSampleFlag(config); Class> cls = ClassGenericsUtil.uglyCastIntoSubclass(PCAFilteredRunner.class); pca = config.tryInstantiate(cls); } public void configRandomSampleFlag(Parameterization config) { Flag randomSampleF = new Flag(DEPENDENCY_DERIVATOR_RANDOM_SAMPLE); if(config.grab(randomSampleF)) { randomSample = randomSampleF.getValue(); } } public void configSampleSize(Parameterization config) { IntParameter sampleSizeP = new IntParameter(SAMPLE_SIZE_ID, true); sampleSizeP.addConstraint(new GreaterConstraint(0)); if(config.grab(sampleSizeP)) { sampleSize = sampleSizeP.getValue(); } } public void configAccuracy(Parameterization config) { IntParameter outputAccuracyP = new IntParameter(OUTPUT_ACCURACY_ID, 4); outputAccuracyP.addConstraint(new GreaterEqualConstraint(0)); if(config.grab(outputAccuracyP)) { outputAccuracy = outputAccuracyP.getValue(); } } @Override protected DependencyDerivator makeInstance() { NumberFormat NF = NumberFormat.getInstance(Locale.US); NF.setMaximumFractionDigits(outputAccuracy); NF.setMinimumFractionDigits(outputAccuracy); return new DependencyDerivator(distanceFunction, NF, pca, sampleSize, randomSample); } } }