summaryrefslogtreecommitdiff
path: root/src/de/lmu/ifi/dbs/elki/math/linearalgebra/pca/PCAFilteredAutotuningRunner.java
diff options
context:
space:
mode:
Diffstat (limited to 'src/de/lmu/ifi/dbs/elki/math/linearalgebra/pca/PCAFilteredAutotuningRunner.java')
-rw-r--r--src/de/lmu/ifi/dbs/elki/math/linearalgebra/pca/PCAFilteredAutotuningRunner.java232
1 files changed, 232 insertions, 0 deletions
diff --git a/src/de/lmu/ifi/dbs/elki/math/linearalgebra/pca/PCAFilteredAutotuningRunner.java b/src/de/lmu/ifi/dbs/elki/math/linearalgebra/pca/PCAFilteredAutotuningRunner.java
new file mode 100644
index 00000000..8b53dc43
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/math/linearalgebra/pca/PCAFilteredAutotuningRunner.java
@@ -0,0 +1,232 @@
+package de.lmu.ifi.dbs.elki.math.linearalgebra.pca;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2012
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.List;
+
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.database.ids.DBID;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
+import de.lmu.ifi.dbs.elki.database.query.DistanceResultPair;
+import de.lmu.ifi.dbs.elki.database.query.DoubleDistanceResultPair;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.EuclideanDistanceFunction;
+import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance;
+import de.lmu.ifi.dbs.elki.math.linearalgebra.EigenPair;
+import de.lmu.ifi.dbs.elki.math.linearalgebra.EigenvalueDecomposition;
+import de.lmu.ifi.dbs.elki.math.linearalgebra.Matrix;
+import de.lmu.ifi.dbs.elki.math.linearalgebra.SortedEigenPairs;
+import de.lmu.ifi.dbs.elki.utilities.DatabaseUtil;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
+
+/**
+ * Performs a self-tuning local PCA based on the covariance matrices of given
+ * objects. At most the closest 'k' points are used in the calculation and a
+ * weight function is applied.
+ *
+ * The number of points actually used depends on when the strong eigenvectors
+ * exhibit the clearest correlation.
+ *
+ * @author Erich Schubert
+ * @param <V> vector type
+ */
+@Reference(authors = "H.-P. Kriegel, P. Kröger, E. Schubert, A. Zimek", title = "A General Framework for Increasing the Robustness of PCA-based Correlation Clustering Algorithms", booktitle = "Proceedings of the 20th International Conference on Scientific and Statistical Database Management (SSDBM), Hong Kong, China, 2008", url = "http://dx.doi.org/10.1007/978-3-540-69497-7_27")
+public class PCAFilteredAutotuningRunner<V extends NumberVector<? extends V, ?>> extends PCAFilteredRunner<V> {
+ /**
+ * Constructor.
+ *
+ * @param covarianceMatrixBuilder
+ * @param eigenPairFilter
+ * @param big
+ * @param small
+ */
+ public PCAFilteredAutotuningRunner(CovarianceMatrixBuilder<V> covarianceMatrixBuilder, EigenPairFilter eigenPairFilter, double big, double small) {
+ super(covarianceMatrixBuilder, eigenPairFilter, big, small);
+ }
+
+ @Override
+ public PCAFilteredResult processIds(DBIDs ids, Relation<? extends V> database) {
+ // Assume Euclidean distance. In the context of PCA, the neighborhood should
+ // be L2-spherical to be unbiased.
+ V center = DatabaseUtil.centroid(database, ids);
+ List<DoubleDistanceResultPair> dres = new ArrayList<DoubleDistanceResultPair>(ids.size());
+ for(DBID id : ids) {
+ final double dist = EuclideanDistanceFunction.STATIC.doubleDistance(center, database.get(id));
+ dres.add(new DoubleDistanceResultPair(dist, id));
+ }
+ Collections.sort(dres);
+ return processQueryResult(dres, database);
+ }
+
+ @Override
+ public <D extends NumberDistance<?, ?>> PCAFilteredResult processQueryResult(Collection<? extends DistanceResultPair<D>> results, Relation<? extends V> database) {
+ assertSortedByDistance(results);
+ final int dim = DatabaseUtil.dimensionality(database);
+
+ List<Matrix> best = new LinkedList<Matrix>();
+ for(int i = 0; i < dim; i++) {
+ best.add(null);
+ }
+ double[] beststrength = new double[dim];
+ for(int i = 0; i < dim; i++) {
+ beststrength[i] = -1;
+ }
+ int[] bestk = new int[dim];
+ // 'history'
+ LinkedList<Matrix> prevM = new LinkedList<Matrix>();
+ LinkedList<Double> prevS = new LinkedList<Double>();
+ LinkedList<Integer> prevD = new LinkedList<Integer>();
+ // TODO: starting parameter shouldn't be hardcoded...
+ int smooth = 3;
+ int startk = 4;
+ if(startk > results.size() - 1) {
+ startk = results.size() - 1;
+ }
+ // TODO: add smoothing options, handle border cases better.
+ for(int k = startk; k < results.size(); k++) {
+ // sorted eigenpairs, eigenvectors, eigenvalues
+ Matrix covMat = covarianceMatrixBuilder.processQueryResults(results, database);
+ EigenvalueDecomposition evd = new EigenvalueDecomposition(covMat);
+ SortedEigenPairs eigenPairs = new SortedEigenPairs(evd, false);
+ FilteredEigenPairs filteredEigenPairs = getEigenPairFilter().filter(eigenPairs);
+
+ // correlationDimension = #strong EV
+ int thisdim = filteredEigenPairs.countStrongEigenPairs();
+
+ // FIXME: handle the case of no strong EVs.
+ assert ((thisdim > 0) && (thisdim <= dim));
+ double thisexplain = computeExplainedVariance(filteredEigenPairs);
+
+ prevM.add(covMat);
+ prevS.add(thisexplain);
+ prevD.add(thisdim);
+ assert (prevS.size() == prevM.size());
+ assert (prevS.size() == prevD.size());
+
+ if(prevS.size() >= 2 * smooth + 1) {
+ // all the same dimension?
+ boolean samedim = true;
+ for(Iterator<Integer> it = prevD.iterator(); it.hasNext();) {
+ if(it.next().intValue() != thisdim) {
+ samedim = false;
+ }
+ }
+ if(samedim) {
+ // average their explain values
+ double avgexplain = 0.0;
+ for(Iterator<Double> it = prevS.iterator(); it.hasNext();) {
+ avgexplain += it.next().doubleValue();
+ }
+ avgexplain /= prevS.size();
+
+ if(avgexplain > beststrength[thisdim - 1]) {
+ beststrength[thisdim - 1] = avgexplain;
+ best.set(thisdim - 1, prevM.get(smooth));
+ bestk[thisdim - 1] = k - smooth;
+ }
+ }
+ prevM.removeFirst();
+ prevS.removeFirst();
+ prevD.removeFirst();
+ assert (prevS.size() == prevM.size());
+ assert (prevS.size() == prevD.size());
+ }
+ }
+ // Try all dimensions, lowest first.
+ for(int i = 0; i < dim; i++) {
+ if(beststrength[i] > 0.0) {
+ // If the best was the lowest or the biggest k, skip it!
+ if(bestk[i] == startk + smooth) {
+ continue;
+ }
+ if(bestk[i] == results.size() - smooth - 1) {
+ continue;
+ }
+ Matrix covMat = best.get(i);
+
+ // We stop at the lowest dimension that did the job for us.
+ // System.err.println("Auto-k: "+bestk[i]+" dim: "+(i+1));
+ return processCovarMatrix(covMat);
+ }
+ }
+ // NOTE: if we didn't get a 'maximum' anywhere, we end up with the data from
+ // the last run of the loop above. I.e. PCA on the full data set. That is
+ // intended.
+ return processCovarMatrix(covarianceMatrixBuilder.processQueryResults(results, database));
+ }
+
+ /**
+ * Compute the explained variance for a FilteredEigenPairs
+ *
+ * @param filteredEigenPairs
+ * @return explained variance by the strong eigenvectors.
+ */
+ private double computeExplainedVariance(FilteredEigenPairs filteredEigenPairs) {
+ double strongsum = 0.0;
+ double weaksum = 0.0;
+ for(EigenPair ep : filteredEigenPairs.getStrongEigenPairs()) {
+ strongsum += ep.getEigenvalue();
+ }
+ for(EigenPair ep : filteredEigenPairs.getWeakEigenPairs()) {
+ weaksum += ep.getEigenvalue();
+ }
+ return strongsum / (strongsum / weaksum);
+ }
+
+ /**
+ * Ensure that the results are sorted by distance.
+ *
+ * @param results
+ */
+ private <D extends NumberDistance<?, ?>> void assertSortedByDistance(Collection<? extends DistanceResultPair<D>> results) {
+ // TODO: sort results instead?
+ double dist = -1.0;
+ for(Iterator<? extends DistanceResultPair<D>> it = results.iterator(); it.hasNext();) {
+ double qr = it.next().getDistance().doubleValue();
+ if(qr < dist) {
+ System.err.println("WARNING: results not sorted by distance!");
+ }
+ dist = qr;
+ }
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer<V extends NumberVector<? extends V, ?>> extends PCAFilteredRunner.Parameterizer<V> {
+ @Override
+ protected PCAFilteredAutotuningRunner<V> makeInstance() {
+ return new PCAFilteredAutotuningRunner<V>(covarianceMatrixBuilder, eigenPairFilter, big, small);
+ }
+ }
+} \ No newline at end of file