summaryrefslogtreecommitdiff
path: root/elki/src/main/java/de/lmu/ifi/dbs/elki/datasource/filter/transform/FastMultidimensionalScalingTransform.java
diff options
context:
space:
mode:
Diffstat (limited to 'elki/src/main/java/de/lmu/ifi/dbs/elki/datasource/filter/transform/FastMultidimensionalScalingTransform.java')
-rw-r--r--elki/src/main/java/de/lmu/ifi/dbs/elki/datasource/filter/transform/FastMultidimensionalScalingTransform.java361
1 files changed, 361 insertions, 0 deletions
diff --git a/elki/src/main/java/de/lmu/ifi/dbs/elki/datasource/filter/transform/FastMultidimensionalScalingTransform.java b/elki/src/main/java/de/lmu/ifi/dbs/elki/datasource/filter/transform/FastMultidimensionalScalingTransform.java
new file mode 100644
index 00000000..121e500b
--- /dev/null
+++ b/elki/src/main/java/de/lmu/ifi/dbs/elki/datasource/filter/transform/FastMultidimensionalScalingTransform.java
@@ -0,0 +1,361 @@
+package de.lmu.ifi.dbs.elki.datasource.filter.transform;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2015
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import java.util.List;
+import java.util.Random;
+
+import de.lmu.ifi.dbs.elki.data.DoubleVector;
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
+import de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation;
+import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle;
+import de.lmu.ifi.dbs.elki.datasource.filter.FilterUtil;
+import de.lmu.ifi.dbs.elki.datasource.filter.ObjectFilter;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.minkowski.SquaredEuclideanDistanceFunction;
+import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress;
+import de.lmu.ifi.dbs.elki.utilities.Alias;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
+
+/**
+ * Rescale the data set using multidimensional scaling, MDS.
+ *
+ * This implementation uses power iterations, which is faster when the number of
+ * data points is much larger than the desired number of dimensions.
+ *
+ * This implementation is O(n^2), and uses O(n^2) memory.
+ *
+ * @author Erich Schubert
+ *
+ * @param <O> Data type
+ */
+@Alias({ "fastmds" })
+public class FastMultidimensionalScalingTransform<O> implements ObjectFilter {
+ /**
+ * Class logger.
+ */
+ private static final Logging LOG = Logging.getLogger(FastMultidimensionalScalingTransform.class);
+
+ /**
+ * Distance function to use.
+ */
+ PrimitiveDistanceFunction<? super O> dist = null;
+
+ /**
+ * Target dimensionality
+ */
+ int tdim;
+
+ /**
+ * Constructor.
+ *
+ * @param tdim Target dimensionality.
+ * @param dist Distance function to use.
+ */
+ public FastMultidimensionalScalingTransform(int tdim, PrimitiveDistanceFunction<? super O> dist) {
+ super();
+ this.tdim = tdim;
+ this.dist = dist;
+ }
+
+ @Override
+ public MultipleObjectsBundle filter(MultipleObjectsBundle objects) {
+ final int size = objects.dataLength();
+ if(size == 0) {
+ return objects;
+ }
+ MultipleObjectsBundle bundle = new MultipleObjectsBundle();
+
+ for(int r = 0; r < objects.metaLength(); r++) {
+ SimpleTypeInformation<? extends Object> type = objects.meta(r);
+ @SuppressWarnings("unchecked")
+ final List<Object> column = (List<Object>) objects.getColumn(r);
+ // Not supported column (e.g. labels):
+ if(!dist.getInputTypeRestriction().isAssignableFromType(type)) {
+ bundle.appendColumn(type, column);
+ continue;
+ }
+ // Get the replacement type information
+ @SuppressWarnings("unchecked")
+ final List<O> castColumn = (List<O>) column;
+ NumberVector.Factory<? extends NumberVector> factory = null;
+ {
+ if(type instanceof VectorFieldTypeInformation) {
+ final VectorFieldTypeInformation<?> ctype = (VectorFieldTypeInformation<?>) type;
+ // Note two-step cast, to make stricter compilers happy.
+ @SuppressWarnings("unchecked")
+ final VectorFieldTypeInformation<? extends NumberVector> vtype = (VectorFieldTypeInformation<? extends NumberVector>) ctype;
+ factory = FilterUtil.guessFactory(vtype);
+ }
+ else {
+ factory = DoubleVector.FACTORY;
+ }
+ bundle.appendColumn(new VectorFieldTypeInformation<>(factory, tdim), castColumn);
+ }
+
+ // Compute distance matrix.
+ double[][] imat = computeDistanceMatrix(castColumn);
+ ClassicMultidimensionalScalingTransform.doubleCenterSymmetric(imat);
+ // Find eigenvectors.
+ {
+ double[][] evs = new double[tdim][size];
+ double[] lambda = new double[tdim];
+ findEigenVectors(imat, evs, lambda);
+
+ // Project each data point to the new coordinates.
+ double[] buf = new double[tdim];
+ for(int i = 0; i < size; i++) {
+ for(int d = 0; d < tdim; d++) {
+ buf[d] = lambda[d] * evs[d][i];
+ }
+ column.set(i, factory.newNumberVector(buf));
+ }
+ }
+ }
+ return bundle;
+ }
+
+ /**
+ * Compute the distance matrix of a vector column.
+ *
+ * @param col Column
+ * @return Distance matrix
+ */
+ protected double[][] computeDistanceMatrix(final List<O> col) {
+ final int size = col.size();
+ double[][] imat = new double[size][size];
+ boolean squared = dist instanceof SquaredEuclideanDistanceFunction;
+ FiniteProgress dprog = LOG.isVerbose() ? new FiniteProgress("Computing distance matrix", (size * (size - 1)) >>> 1, LOG) : null;
+ for(int x = 0; x < size; x++) {
+ final O ox = col.get(x);
+ for(int y = x + 1; y < size; y++) {
+ final O oy = col.get(y);
+ double distance = dist.distance(ox, oy);
+ distance *= (squared ? -.5 : (-.5 * distance));
+ imat[x][y] = distance;
+ imat[y][x] = distance;
+ }
+ if(dprog != null) {
+ dprog.setProcessed(dprog.getProcessed() + size - x - 1, LOG);
+ }
+ }
+ LOG.ensureCompleted(dprog);
+ return imat;
+ }
+
+ /**
+ * Find the first eigenvectors and eigenvalues using power iterations.
+ *
+ * @param imat Matrix (will be modified!)
+ * @param evs Eigenvectors output
+ * @param lambda Eigenvalues output
+ */
+ protected void findEigenVectors(double[][] imat, double[][] evs, double[] lambda) {
+ final int size = imat.length;
+ Random rnd = new Random(); // FIXME: make parameterizable
+ double[] tmp = new double[size];
+ FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Learning projections", tdim, LOG) : null;
+ for(int d = 0; d < tdim;) {
+ final double[] cur = evs[d];
+ randomInitialization(cur, rnd);
+ double l = multiply(imat, cur, tmp);
+ for(int iter = 0; iter < 100; iter++) {
+ // This will scale "tmp" to unit length, and copy it to cur:
+ double delta = updateEigenvector(tmp, cur, l);
+ if(delta < 1e-10) {
+ break;
+ }
+ l = multiply(imat, cur, tmp);
+ }
+ l = estimateEigenvalue(imat, cur);
+ lambda[d] = l;
+ d++; // Successful
+ LOG.incrementProcessed(prog);
+ if(d == tdim) {
+ break;
+ }
+ // Update matrix
+ updateMatrix(imat, cur, l);
+ }
+ LOG.ensureCompleted(prog);
+ }
+
+ /**
+ * Choose a random vector of unit norm for power iterations.
+ *
+ * @param out Output storage
+ * @param rnd Random source.
+ */
+ protected void randomInitialization(double[] out, Random rnd) {
+ double l2 = 0.;
+ while(!(l2 > 0)) {
+ for(int d = 0; d < out.length; d++) {
+ final double val = rnd.nextDouble();
+ out[d] = val;
+ l2 += val * val;
+ }
+ }
+ // If zero, retry. This should barely ever happen.
+ if(!(l2 > 0)) {
+ randomInitialization(out, rnd);
+ return;
+ }
+ // Standardize:
+ final double s = 1. / Math.sqrt(l2);
+ for(int d = 0; d < out.length; d++) {
+ out[d] *= s;
+ }
+ }
+
+ /**
+ * Compute out = A * in, and return the (signed!) length of the output vector.
+ *
+ * @param mat Matrix.
+ * @param in Input vector.
+ * @param out Output vector storage.
+ * @return Length of this vector.
+ */
+ protected double multiply(double[][] mat, double[] in, double[] out) {
+ double l = 0.;
+ // Matrix multiplication:
+ for(int d1 = 0; d1 < in.length; d1++) {
+ final double[] row = mat[d1];
+ double t = 0.;
+ for(int d2 = 0; d2 < in.length; d2++) {
+ t += row[d2] * in[d2];
+ }
+ out[d1] = t;
+ l += t * t;
+ }
+ return l > 0 ? Math.sqrt(l) : 0.;
+ }
+
+ /**
+ * Compute the change in the eigenvector, and normalize the output vector
+ * while doing so.
+ *
+ * @param in Input vector
+ * @param out Output vector
+ * @param l Eigenvalue
+ * @return Change
+ */
+ protected double updateEigenvector(double[] in, double[] out, double l) {
+ double s = 1. / (l > 0. ? l : l < 0. ? -l : 1.);
+ s = (in[0] > 0.) ? s : -s; // Reduce flipping vectors
+ double diff = 0.;
+ for(int d = 0; d < in.length; d++) {
+ in[d] *= s; // Scale to unit length
+ // Compute change from previous iteration:
+ double delta = in[d] - out[d];
+ diff += delta * delta;
+ out[d] = in[d]; // Update output storage
+ }
+ return diff;
+ }
+
+ /**
+ * Estimate the (singed!) Eigenvalue for a particular vector.
+ *
+ * @param mat Matrix.
+ * @param in Input vector.
+ * @return Estimated eigenvalue
+ */
+ protected double estimateEigenvalue(double[][] mat, double[] in) {
+ double de = 0., di = 0.;
+ // Matrix multiplication:
+ for(int d1 = 0; d1 < in.length; d1++) {
+ final double[] row = mat[d1];
+ double t = 0.;
+ for(int d2 = 0; d2 < in.length; d2++) {
+ t += row[d2] * in[d2];
+ }
+ final double s = in[d1];
+ de += t * s;
+ di += s * s;
+ }
+ return de / di;
+ }
+
+ /**
+ * Update matrix, by removing the effects of a known Eigenvector.
+ *
+ * @param mat Matrix
+ * @param evec Known normalized Eigenvector
+ * @param eval Eigenvalue
+ */
+ protected void updateMatrix(double[][] mat, final double[] evec, double eval) {
+ final int size = mat.length;
+ for(int i = 0; i < size; i++) {
+ final double[] mati = mat[i];
+ final double eveci = evec[i];
+ for(int j = 0; j < size; j++) {
+ mati[j] -= eval * eveci * evec[j];
+ }
+ }
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer<O extends NumberVector> extends AbstractParameterizer {
+ /**
+ * Target dimensionality.
+ */
+ int tdim;
+
+ /**
+ * Distance function to use.
+ */
+ PrimitiveDistanceFunction<? super O> dist = null;
+
+ @Override
+ protected void makeOptions(Parameterization config) {
+ super.makeOptions(config);
+
+ IntParameter dimP = new IntParameter(ClassicMultidimensionalScalingTransform.Parameterizer.DIM_ID);
+ if(config.grab(dimP)) {
+ tdim = dimP.intValue();
+ }
+
+ ObjectParameter<PrimitiveDistanceFunction<? super O>> distP = new ObjectParameter<>(ClassicMultidimensionalScalingTransform.Parameterizer.DISTANCE_ID, PrimitiveDistanceFunction.class, SquaredEuclideanDistanceFunction.class);
+ if(config.grab(distP)) {
+ dist = distP.instantiateClass(config);
+ }
+ }
+
+ @Override
+ protected FastMultidimensionalScalingTransform<O> makeInstance() {
+ return new FastMultidimensionalScalingTransform<>(tdim, dist);
+ }
+ }
+}