diff options
Diffstat (limited to 'elki/src/main/java/de/lmu/ifi/dbs/elki/datasource/filter/transform/FastMultidimensionalScalingTransform.java')
-rw-r--r-- | elki/src/main/java/de/lmu/ifi/dbs/elki/datasource/filter/transform/FastMultidimensionalScalingTransform.java | 361 |
1 files changed, 361 insertions, 0 deletions
diff --git a/elki/src/main/java/de/lmu/ifi/dbs/elki/datasource/filter/transform/FastMultidimensionalScalingTransform.java b/elki/src/main/java/de/lmu/ifi/dbs/elki/datasource/filter/transform/FastMultidimensionalScalingTransform.java new file mode 100644 index 00000000..121e500b --- /dev/null +++ b/elki/src/main/java/de/lmu/ifi/dbs/elki/datasource/filter/transform/FastMultidimensionalScalingTransform.java @@ -0,0 +1,361 @@ +package de.lmu.ifi.dbs.elki.datasource.filter.transform; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2015 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +import java.util.List; +import java.util.Random; + +import de.lmu.ifi.dbs.elki.data.DoubleVector; +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation; +import de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation; +import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle; +import de.lmu.ifi.dbs.elki.datasource.filter.FilterUtil; +import de.lmu.ifi.dbs.elki.datasource.filter.ObjectFilter; +import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancefunction.minkowski.SquaredEuclideanDistanceFunction; +import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress; +import de.lmu.ifi.dbs.elki.utilities.Alias; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; + +/** + * Rescale the data set using multidimensional scaling, MDS. + * + * This implementation uses power iterations, which is faster when the number of + * data points is much larger than the desired number of dimensions. + * + * This implementation is O(n^2), and uses O(n^2) memory. + * + * @author Erich Schubert + * + * @param <O> Data type + */ +@Alias({ "fastmds" }) +public class FastMultidimensionalScalingTransform<O> implements ObjectFilter { + /** + * Class logger. + */ + private static final Logging LOG = Logging.getLogger(FastMultidimensionalScalingTransform.class); + + /** + * Distance function to use. + */ + PrimitiveDistanceFunction<? super O> dist = null; + + /** + * Target dimensionality + */ + int tdim; + + /** + * Constructor. + * + * @param tdim Target dimensionality. + * @param dist Distance function to use. + */ + public FastMultidimensionalScalingTransform(int tdim, PrimitiveDistanceFunction<? super O> dist) { + super(); + this.tdim = tdim; + this.dist = dist; + } + + @Override + public MultipleObjectsBundle filter(MultipleObjectsBundle objects) { + final int size = objects.dataLength(); + if(size == 0) { + return objects; + } + MultipleObjectsBundle bundle = new MultipleObjectsBundle(); + + for(int r = 0; r < objects.metaLength(); r++) { + SimpleTypeInformation<? extends Object> type = objects.meta(r); + @SuppressWarnings("unchecked") + final List<Object> column = (List<Object>) objects.getColumn(r); + // Not supported column (e.g. labels): + if(!dist.getInputTypeRestriction().isAssignableFromType(type)) { + bundle.appendColumn(type, column); + continue; + } + // Get the replacement type information + @SuppressWarnings("unchecked") + final List<O> castColumn = (List<O>) column; + NumberVector.Factory<? extends NumberVector> factory = null; + { + if(type instanceof VectorFieldTypeInformation) { + final VectorFieldTypeInformation<?> ctype = (VectorFieldTypeInformation<?>) type; + // Note two-step cast, to make stricter compilers happy. + @SuppressWarnings("unchecked") + final VectorFieldTypeInformation<? extends NumberVector> vtype = (VectorFieldTypeInformation<? extends NumberVector>) ctype; + factory = FilterUtil.guessFactory(vtype); + } + else { + factory = DoubleVector.FACTORY; + } + bundle.appendColumn(new VectorFieldTypeInformation<>(factory, tdim), castColumn); + } + + // Compute distance matrix. + double[][] imat = computeDistanceMatrix(castColumn); + ClassicMultidimensionalScalingTransform.doubleCenterSymmetric(imat); + // Find eigenvectors. + { + double[][] evs = new double[tdim][size]; + double[] lambda = new double[tdim]; + findEigenVectors(imat, evs, lambda); + + // Project each data point to the new coordinates. + double[] buf = new double[tdim]; + for(int i = 0; i < size; i++) { + for(int d = 0; d < tdim; d++) { + buf[d] = lambda[d] * evs[d][i]; + } + column.set(i, factory.newNumberVector(buf)); + } + } + } + return bundle; + } + + /** + * Compute the distance matrix of a vector column. + * + * @param col Column + * @return Distance matrix + */ + protected double[][] computeDistanceMatrix(final List<O> col) { + final int size = col.size(); + double[][] imat = new double[size][size]; + boolean squared = dist instanceof SquaredEuclideanDistanceFunction; + FiniteProgress dprog = LOG.isVerbose() ? new FiniteProgress("Computing distance matrix", (size * (size - 1)) >>> 1, LOG) : null; + for(int x = 0; x < size; x++) { + final O ox = col.get(x); + for(int y = x + 1; y < size; y++) { + final O oy = col.get(y); + double distance = dist.distance(ox, oy); + distance *= (squared ? -.5 : (-.5 * distance)); + imat[x][y] = distance; + imat[y][x] = distance; + } + if(dprog != null) { + dprog.setProcessed(dprog.getProcessed() + size - x - 1, LOG); + } + } + LOG.ensureCompleted(dprog); + return imat; + } + + /** + * Find the first eigenvectors and eigenvalues using power iterations. + * + * @param imat Matrix (will be modified!) + * @param evs Eigenvectors output + * @param lambda Eigenvalues output + */ + protected void findEigenVectors(double[][] imat, double[][] evs, double[] lambda) { + final int size = imat.length; + Random rnd = new Random(); // FIXME: make parameterizable + double[] tmp = new double[size]; + FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Learning projections", tdim, LOG) : null; + for(int d = 0; d < tdim;) { + final double[] cur = evs[d]; + randomInitialization(cur, rnd); + double l = multiply(imat, cur, tmp); + for(int iter = 0; iter < 100; iter++) { + // This will scale "tmp" to unit length, and copy it to cur: + double delta = updateEigenvector(tmp, cur, l); + if(delta < 1e-10) { + break; + } + l = multiply(imat, cur, tmp); + } + l = estimateEigenvalue(imat, cur); + lambda[d] = l; + d++; // Successful + LOG.incrementProcessed(prog); + if(d == tdim) { + break; + } + // Update matrix + updateMatrix(imat, cur, l); + } + LOG.ensureCompleted(prog); + } + + /** + * Choose a random vector of unit norm for power iterations. + * + * @param out Output storage + * @param rnd Random source. + */ + protected void randomInitialization(double[] out, Random rnd) { + double l2 = 0.; + while(!(l2 > 0)) { + for(int d = 0; d < out.length; d++) { + final double val = rnd.nextDouble(); + out[d] = val; + l2 += val * val; + } + } + // If zero, retry. This should barely ever happen. + if(!(l2 > 0)) { + randomInitialization(out, rnd); + return; + } + // Standardize: + final double s = 1. / Math.sqrt(l2); + for(int d = 0; d < out.length; d++) { + out[d] *= s; + } + } + + /** + * Compute out = A * in, and return the (signed!) length of the output vector. + * + * @param mat Matrix. + * @param in Input vector. + * @param out Output vector storage. + * @return Length of this vector. + */ + protected double multiply(double[][] mat, double[] in, double[] out) { + double l = 0.; + // Matrix multiplication: + for(int d1 = 0; d1 < in.length; d1++) { + final double[] row = mat[d1]; + double t = 0.; + for(int d2 = 0; d2 < in.length; d2++) { + t += row[d2] * in[d2]; + } + out[d1] = t; + l += t * t; + } + return l > 0 ? Math.sqrt(l) : 0.; + } + + /** + * Compute the change in the eigenvector, and normalize the output vector + * while doing so. + * + * @param in Input vector + * @param out Output vector + * @param l Eigenvalue + * @return Change + */ + protected double updateEigenvector(double[] in, double[] out, double l) { + double s = 1. / (l > 0. ? l : l < 0. ? -l : 1.); + s = (in[0] > 0.) ? s : -s; // Reduce flipping vectors + double diff = 0.; + for(int d = 0; d < in.length; d++) { + in[d] *= s; // Scale to unit length + // Compute change from previous iteration: + double delta = in[d] - out[d]; + diff += delta * delta; + out[d] = in[d]; // Update output storage + } + return diff; + } + + /** + * Estimate the (singed!) Eigenvalue for a particular vector. + * + * @param mat Matrix. + * @param in Input vector. + * @return Estimated eigenvalue + */ + protected double estimateEigenvalue(double[][] mat, double[] in) { + double de = 0., di = 0.; + // Matrix multiplication: + for(int d1 = 0; d1 < in.length; d1++) { + final double[] row = mat[d1]; + double t = 0.; + for(int d2 = 0; d2 < in.length; d2++) { + t += row[d2] * in[d2]; + } + final double s = in[d1]; + de += t * s; + di += s * s; + } + return de / di; + } + + /** + * Update matrix, by removing the effects of a known Eigenvector. + * + * @param mat Matrix + * @param evec Known normalized Eigenvector + * @param eval Eigenvalue + */ + protected void updateMatrix(double[][] mat, final double[] evec, double eval) { + final int size = mat.length; + for(int i = 0; i < size; i++) { + final double[] mati = mat[i]; + final double eveci = evec[i]; + for(int j = 0; j < size; j++) { + mati[j] -= eval * eveci * evec[j]; + } + } + } + + /** + * Parameterization class. + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer<O extends NumberVector> extends AbstractParameterizer { + /** + * Target dimensionality. + */ + int tdim; + + /** + * Distance function to use. + */ + PrimitiveDistanceFunction<? super O> dist = null; + + @Override + protected void makeOptions(Parameterization config) { + super.makeOptions(config); + + IntParameter dimP = new IntParameter(ClassicMultidimensionalScalingTransform.Parameterizer.DIM_ID); + if(config.grab(dimP)) { + tdim = dimP.intValue(); + } + + ObjectParameter<PrimitiveDistanceFunction<? super O>> distP = new ObjectParameter<>(ClassicMultidimensionalScalingTransform.Parameterizer.DISTANCE_ID, PrimitiveDistanceFunction.class, SquaredEuclideanDistanceFunction.class); + if(config.grab(distP)) { + dist = distP.instantiateClass(config); + } + } + + @Override + protected FastMultidimensionalScalingTransform<O> makeInstance() { + return new FastMultidimensionalScalingTransform<>(tdim, dist); + } + } +} |