diff options
Diffstat (limited to 'src/de/lmu/ifi/dbs/elki/math/statistics/dependence/CorrelationDependenceMeasure.java')
-rw-r--r-- | src/de/lmu/ifi/dbs/elki/math/statistics/dependence/CorrelationDependenceMeasure.java | 141 |
1 files changed, 141 insertions, 0 deletions
diff --git a/src/de/lmu/ifi/dbs/elki/math/statistics/dependence/CorrelationDependenceMeasure.java b/src/de/lmu/ifi/dbs/elki/math/statistics/dependence/CorrelationDependenceMeasure.java new file mode 100644 index 00000000..e03601f7 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/math/statistics/dependence/CorrelationDependenceMeasure.java @@ -0,0 +1,141 @@ +package de.lmu.ifi.dbs.elki.math.statistics.dependence; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2014 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +import java.util.List; + +import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.NumberArrayAdapter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; + +/** + * Pearson product-moment correlation coefficient. + * + * @author Erich Schubert + */ +public class CorrelationDependenceMeasure extends AbstractDependenceMeasure { + /** + * Class logger. + */ + private static final Logging LOG = Logging.getLogger(CorrelationDependenceMeasure.class); + + /** + * Static instance. + */ + public static final CorrelationDependenceMeasure STATIC = new CorrelationDependenceMeasure(); + + /** + * Constructor - use {@link #STATIC} instance. + */ + protected CorrelationDependenceMeasure() { + super(); + } + + @Override + public <A, B> double dependence(NumberArrayAdapter<?, A> adapter1, A data1, NumberArrayAdapter<?, B> adapter2, B data2) { + final int len = size(adapter1, data1, adapter2, data2); + // Perform two-pass estimation, which is numerically stable and often faster + // than the Knuth-Welford approach (see PearsonCorrelation class) + double m1 = 0., m2 = 0; + for(int i = 0; i < len; i++) { + m1 += adapter1.getDouble(data1, i); + m2 += adapter2.getDouble(data2, i); + } + m1 /= len; + m2 /= len; + // Second pass: variances and covariance + double v1 = 0., v2 = 0., cov = 0.; + for(int i = 0; i < len; i++) { + double d1 = adapter1.getDouble(data1, i) - m1; + double d2 = adapter2.getDouble(data2, i) - m2; + v1 += d1 * d1; + v2 += d2 * d2; + cov += d1 * d2; + } + // Note: we did not normalize by len, as this cancels out. + return cov / Math.sqrt(v1 * v2); + } + + @Override + public <A> double[] dependence(NumberArrayAdapter<?, A> adapter, List<? extends A> data) { + final int dims = data.size(); + final int len = size(adapter, data); + double[] means = new double[dims]; + // Two passes - often faster due to the lower numerical cost + // And accurate, don't use sum-of-squares. + for(int j = 0; j < dims; j++) { + double m = 0.; + A da = data.get(j); + for(int i = 0; i < len; i++) { + m += adapter.getDouble(da, i); + } + means[j] = m / len; + } + // Build the covariance matrix, lower triangular half + double[] vst = new double[dims]; + double[] cov = new double[(dims * (dims - 1)) >> 1]; + double[] buf = new double[dims]; + for(int i = 0; i < len; i++) { + for(int j = 0; j < dims; j++) { + buf[j] = adapter.getDouble(data.get(j), i) - means[j]; + } + for(int y = 0, c = 0; y < dims; y++) { + for(int x = 0; x < y; x++) { + cov[c++] += buf[x] * buf[y]; + } + vst[y] += buf[y] * buf[y]; + } + } + // Compute standard deviations (times sqrt(len)!): + for(int y = 0; y < dims; y++) { + if(vst[y] == 0.) { + LOG.warning("Correlation is not well defined for constant attributes."); + } + vst[y] = Math.sqrt(vst[y]); + } + for(int y = 1, c = 0; y < dims; y++) { + for(int x = 0; x < y; x++) { + // We don't need to divide by sqrt(len), because it will cancel out with + // the division we skipped just above. + cov[c] = cov[c] / (vst[x] * vst[y]); + c++; + } + } + return cov; + } + + /** + * Parameterization class + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer extends AbstractParameterizer { + @Override + protected CorrelationDependenceMeasure makeInstance() { + return STATIC; + } + } +} |