diff options
Diffstat (limited to 'src/de/lmu/ifi/dbs/elki/math/statistics')
15 files changed, 1918 insertions, 38 deletions
diff --git a/src/de/lmu/ifi/dbs/elki/math/statistics/distribution/BetaDistribution.java b/src/de/lmu/ifi/dbs/elki/math/statistics/distribution/BetaDistribution.java new file mode 100644 index 00000000..1efe19dd --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/math/statistics/distribution/BetaDistribution.java @@ -0,0 +1,499 @@ +package de.lmu.ifi.dbs.elki.math.statistics.distribution;
+
+import java.util.Random;
+
+import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2012
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/**
+ * Beta Distribution with implementation of the regularized incomplete beta
+ * function
+ *
+ * @author Jan Brusis
+ * @author Erich Schubert
+ */
+public class BetaDistribution implements DistributionWithRandom {
+ /**
+ * Numerical precision to use
+ */
+ static final double NUM_PRECISION = 1E-15;
+
+ /**
+ * Limit of when to switch to quadrature method
+ */
+ static final double SWITCH = 3000;
+
+ /**
+ * Abscissas for Gauss-Legendre quadrature
+ */
+ static final double[] GAUSSLEGENDRE_Y = { 0.0021695375159141994, 0.011413521097787704, 0.027972308950302116, 0.051727015600492421, 0.082502225484340941, 0.12007019910960293, 0.16415283300752470, 0.21442376986779355, 0.27051082840644336, 0.33199876341447887, 0.39843234186401943, 0.46931971407375483, 0.54413605556657973, 0.62232745288031077, 0.70331500465597174, 0.78649910768313447, 0.87126389619061517, 0.95698180152629142 };
+
+ /**
+ * Weights for Gauss-Legendre quadrature
+ */
+ static final double[] GAUSSLEGENDRE_W = { 0.0055657196642445571, 0.012915947284065419, 0.020181515297735382, 0.027298621498568734, 0.034213810770299537, 0.040875750923643261, 0.047235083490265582, 0.053244713977759692, 0.058860144245324798, 0.064039797355015485, 0.068745323835736408, 0.072941885005653087, 0.076598410645870640, 0.079687828912071670, 0.082187266704339706, 0.084078218979661945, 0.085346685739338721, 0.085983275670394821 };
+
+ /**
+ * Shape parameter of beta distribution
+ */
+ private final double alpha;
+
+ /**
+ * Shape parameter of beta distribution
+ */
+ private final double beta;
+
+ /**
+ * For random number generation
+ */
+ private Random random;
+
+ /**
+ * Log beta(a, b) cache
+ */
+ private double logbab;
+
+ /**
+ * Constructor.
+ *
+ * @param a shape Parameter a
+ * @param b shape Parameter b
+ */
+ public BetaDistribution(double a, double b) {
+ this(a, b, new Random());
+ }
+
+ /**
+ * Constructor.
+ *
+ * @param a shape Parameter a
+ * @param b shape Parameter b
+ * @param random Random generator
+ */
+ public BetaDistribution(double a, double b, Random random) {
+ super();
+ if(a <= 0.0 || b <= 0.0) {
+ throw new IllegalArgumentException("Invalid parameters for Beta distribution.");
+ }
+
+ this.alpha = a;
+ this.beta = b;
+ this.logbab = logBeta(a, b);
+ this.random = random;
+ }
+
+ @Override
+ public double pdf(double val) {
+ if(val < 0. || val > 1.) {
+ return 0.;
+ }
+ if(val == 0.) {
+ if(alpha > 1.) {
+ return 0.;
+ }
+ if(alpha < 1.) {
+ return Double.POSITIVE_INFINITY;
+ }
+ return beta;
+ }
+ if(val == 1.) {
+ if(beta > 1.) {
+ return 0.;
+ }
+ if(beta < 1.) {
+ return Double.POSITIVE_INFINITY;
+ }
+ return alpha;
+ }
+ return Math.exp(-logbab + Math.log(val) * (alpha - 1) + Math.log1p(-val) * (beta - 1));
+ }
+
+ @Override
+ public double cdf(double x) {
+ if(alpha <= 0.0 || beta <= 0.0 || Double.isNaN(alpha) || Double.isNaN(beta) || Double.isNaN(x)) {
+ return Double.NaN;
+ }
+ if(x <= 0.0) {
+ return 0.0;
+ }
+ if(x >= 1.0) {
+ return 1.0;
+ }
+ if(alpha > SWITCH && beta > SWITCH) {
+ return regularizedIncBetaQuadrature(alpha, beta, x);
+ }
+ double bt = Math.exp(-logbab + alpha * Math.log(x) + beta * Math.log1p(-x));
+ if(x < (alpha + 1.0) / (alpha + beta + 2.0)) {
+ return bt * regularizedIncBetaCF(alpha, beta, x) / alpha;
+ }
+ else {
+ return 1.0 - bt * regularizedIncBetaCF(beta, alpha, 1.0 - x) / beta;
+ }
+ }
+
+ @Override
+ public double quantile(double x) {
+ // Valid parameters
+ if(x < 0 || x > 1 || Double.isNaN(x)) {
+ return Double.NaN;
+ }
+ if(x == 0) {
+ return 0.0;
+ }
+ if(x == 1) {
+ return 1.0;
+ }
+ // Simpler to compute inverse?
+ if(x > 0.5) {
+ return 1 - rawQuantile(1 - x, beta, alpha, logbab);
+ }
+ else {
+ return rawQuantile(x, alpha, beta, logbab);
+ }
+ }
+
+ @Override
+ public double nextRandom() {
+ double x = GammaDistribution.nextRandom(alpha, 1, random);
+ double y = GammaDistribution.nextRandom(beta, 1, random);
+ return x / (x + y);
+ }
+
+ @Override
+ public String toString() {
+ return "BetaDistribution(alpha=" + alpha + ", beta=" + beta + ")";
+ }
+
+ /**
+ * Static version of the CDF of the beta distribution
+ *
+ * @param val Value
+ * @param alpha Shape parameter a
+ * @param beta Shape parameter b
+ * @return cumulative density
+ */
+ public static double cdf(double val, double alpha, double beta) {
+ return regularizedIncBeta(val, alpha, beta);
+ }
+
+ /**
+ * Static version of the PDF of the beta distribution
+ *
+ * @param val Value
+ * @param alpha Shape parameter a
+ * @param beta Shape parameter b
+ * @return probability density
+ */
+ public static double pdf(double val, double alpha, double beta) {
+ if(alpha <= 0. || beta <= 0. || Double.isNaN(alpha) || Double.isNaN(beta) || Double.isNaN(val)) {
+ return Double.NaN;
+ }
+ if(val < 0. || val > 1.) {
+ return 0.;
+ }
+ if(val == 0.) {
+ if(alpha > 1.) {
+ return 0.;
+ }
+ if(alpha < 1.) {
+ return Double.POSITIVE_INFINITY;
+ }
+ return beta;
+ }
+ if(val == 1.) {
+ if(beta > 1.) {
+ return 0.;
+ }
+ if(beta < 1.) {
+ return Double.POSITIVE_INFINITY;
+ }
+ return alpha;
+ }
+ return Math.exp(-logBeta(alpha, beta) + Math.log(val) * (alpha - 1) + Math.log1p(-val) * (beta - 1));
+ }
+
+ /**
+ * Compute log beta(a,b)
+ *
+ * @param alpha Shape parameter a
+ * @param beta Shape parameter b
+ * @return Logarithm of result
+ */
+ public static double logBeta(double alpha, double beta) {
+ return GammaDistribution.logGamma(alpha) + GammaDistribution.logGamma(beta) - GammaDistribution.logGamma(alpha + beta);
+ }
+
+ /**
+ * Computes the regularized incomplete beta function I_x(a, b) which is also
+ * the CDF of the beta distribution. Based on the book "Numerical Recipes"
+ *
+ * @param alpha Parameter a
+ * @param beta Parameter b
+ * @param x Parameter x
+ * @return Value of the regularized incomplete beta function
+ */
+ public static double regularizedIncBeta(double x, double alpha, double beta) {
+ if(alpha <= 0.0 || beta <= 0.0 || Double.isNaN(alpha) || Double.isNaN(beta) || Double.isNaN(x)) {
+ return Double.NaN;
+ }
+ if(x <= 0.0) {
+ return 0.0;
+ }
+ if(x >= 1.0) {
+ return 1.0;
+ }
+ if(alpha > SWITCH && beta > SWITCH) {
+ return regularizedIncBetaQuadrature(alpha, beta, x);
+ }
+ double bt = Math.exp(-logBeta(alpha, beta) + alpha * Math.log(x) + beta * Math.log1p(-x));
+ if(x < (alpha + 1.0) / (alpha + beta + 2.0)) {
+ return bt * regularizedIncBetaCF(alpha, beta, x) / alpha;
+ }
+ else {
+ return 1.0 - bt * regularizedIncBetaCF(beta, alpha, 1.0 - x) / beta;
+ }
+ }
+
+ /**
+ * Returns the regularized incomplete beta function I_x(a, b) Includes the
+ * continued fraction way of computing, based on the book "Numerical Recipes".
+ *
+ * @param alpha Parameter a
+ * @param beta Parameter b
+ * @param x Parameter x
+ * @return result
+ */
+ protected static double regularizedIncBetaCF(double alpha, double beta, double x) {
+ final double FPMIN = Double.MIN_VALUE / NUM_PRECISION;
+ double qab = alpha + beta;
+ double qap = alpha + 1.0;
+ double qam = alpha - 1.0;
+ double c = 1.0;
+ double d = 1.0 - qab * x / qap;
+ if(Math.abs(d) < FPMIN) {
+ d = FPMIN;
+ }
+ d = 1.0 / d;
+ double h = d;
+ for(int m = 1; m < 10000; m++) {
+ int m2 = 2 * m;
+ double aa = m * (beta - m) * x / ((qam + m2) * (alpha + m2));
+ d = 1.0 + aa * d;
+ if(Math.abs(d) < FPMIN) {
+ d = FPMIN;
+ }
+ c = 1.0 + aa / c;
+ if(Math.abs(c) < FPMIN) {
+ c = FPMIN;
+ }
+ d = 1.0 / d;
+ h *= d * c;
+ aa = -(alpha + m) * (qab + m) * x / ((alpha + m2) * (qap + m2));
+ d = 1.0 + aa * d;
+ if(Math.abs(d) < FPMIN) {
+ d = FPMIN;
+ }
+ c = 1.0 + aa / c;
+ if(Math.abs(c) < FPMIN) {
+ c = FPMIN;
+ }
+ d = 1.0 / d;
+ double del = d * c;
+ h *= del;
+ if(Math.abs(del - 1.0) <= NUM_PRECISION) {
+ break;
+ }
+ }
+ return h;
+ }
+
+ /**
+ * Returns the regularized incomplete beta function I_x(a, b) by quadrature,
+ * based on the book "Numerical Recipes".
+ *
+ * @param alpha Parameter a
+ * @param beta Parameter b
+ * @param x Parameter x
+ * @return result
+ */
+ protected static double regularizedIncBetaQuadrature(double alpha, double beta, double x) {
+ double a1 = alpha - 1.0;
+ double b1 = beta - 1.0;
+ double mu = alpha / (alpha + beta);
+ double lnmu = Math.log(mu);
+ double lnmuc = Math.log1p(-mu);
+ double t = Math.sqrt(alpha * beta / ((alpha + beta) * (alpha + beta) * (alpha + beta + 1.0)));
+ double xu;
+ if(x > alpha / (alpha + beta)) {
+ if(x >= 1.0) {
+ return 1.0;
+ }
+ xu = Math.min(1.0, Math.max(mu + 10.0 * t, x + 5.0 * t));
+ }
+ else {
+ if(x <= 0.0) {
+ return 0.0;
+ }
+ xu = Math.max(0.0, Math.min(mu - 10.0 * t, x - 5.0 * t));
+ }
+ double sum = 0.0;
+ for(int i = 0; i < GAUSSLEGENDRE_Y.length; i++) {
+ t = x + (xu - x) * GAUSSLEGENDRE_Y[i];
+ sum += GAUSSLEGENDRE_W[i] * Math.exp(a1 * (Math.log(t) - lnmu) + b1 * (Math.log1p(-t) - lnmuc));
+ }
+ double ans = sum * (xu - x) * Math.exp(a1 * lnmu - GammaDistribution.logGamma(alpha) + b1 * lnmuc - GammaDistribution.logGamma(b1) + GammaDistribution.logGamma(alpha + beta));
+ return ans > 0 ? 1.0 - ans : -ans;
+ }
+
+ /**
+ * Compute quantile (inverse cdf) for Beta distributions.
+ *
+ * @param p Probability
+ * @param alpha Shape parameter a
+ * @param beta Shape parameter b
+ * @return Probit for Beta distribution
+ */
+ public static double quantile(double p, double alpha, double beta) {
+ // Valid parameters
+ if(Double.isNaN(alpha) || Double.isNaN(beta) || Double.isNaN(p) || alpha < 0. || beta < 0.) {
+ return Double.NaN;
+ }
+ if(p < 0 || p > 1) {
+ return Double.NaN;
+ }
+ if(p == 0) {
+ return 0.0;
+ }
+ if(p == 1) {
+ return 1.0;
+ }
+ // Simpler to compute inverse?
+ if(p > 0.5) {
+ return 1 - rawQuantile(1 - p, beta, alpha, logBeta(beta, alpha));
+ }
+ else {
+ return rawQuantile(p, alpha, beta, logBeta(alpha, beta));
+ }
+ }
+
+ /**
+ * Raw quantile function
+ *
+ * @param p P, must be 0 < p <= .5
+ * @param alpha Alpha
+ * @param beta Beta
+ * @param logbeta log Beta(alpha, beta)
+ * @return Position
+ */
+ protected static double rawQuantile(double p, double alpha, double beta, final double logbeta) {
+ // Initial estimate for x
+ double x;
+ {
+ // Very fast approximation of y.
+ double tmp = Math.sqrt(-2 * Math.log(p));
+ double y = tmp - (2.30753 + 0.27061 * tmp) / (1. + (0.99229 + 0.04481 * tmp) * tmp);
+
+ if(alpha > 1 && beta > 1) {
+ double r = (y * y - 3.) / 6.;
+ double s = 1. / (alpha + alpha - 1.);
+ double t = 1. / (beta + beta - 1.);
+ double h = 2. / (s + t);
+ double w = y * Math.sqrt(h + r) / h - (t - s) * (r + 5. / 6. - 2. / (3. * h));
+ x = alpha / (alpha + beta * Math.exp(w + w));
+ }
+ else {
+ double r = beta + beta;
+ double t = 1. / (9. * beta);
+ t = r * Math.pow(1. - t + y * Math.sqrt(t), 3.0);
+ if(t <= 0.) {
+ x = 1. - Math.exp((Math.log1p(-p) + Math.log(beta) + logbeta) / beta);
+ }
+ else {
+ t = (4. * alpha + r - 2.) / t;
+ if(t <= 1.) {
+ x = Math.exp((Math.log(p * alpha) + logbeta) / alpha);
+ }
+ else {
+ x = 1. - 2. / (t + 1.);
+ }
+ }
+ }
+ // Degenerate initial approximations
+ if(x < 3e-308 || x > 1 - 2.22e-16) {
+ x = 0.5;
+ }
+ }
+
+ // Newon-Raphson method using the CDF
+ {
+ final double ialpha = 1 - alpha;
+ final double ibeta = 1 - beta;
+
+ // Desired accuracy, as from GNU R adoption of AS 109
+ final double acu = Math.max(1e-300, Math.pow(10., -13 - 2.5 / (alpha * alpha) - .5 / (p * p)));
+ double prevstep = 0., y = 0., stepsize = 1;
+
+ for(int outer = 0; outer < 1000; outer++) {
+ // Current CDF value
+ double ynew = cdf(x, alpha, beta);
+ if(Double.isInfinite(ynew)) { // Degenerated.
+ return Double.NaN;
+ }
+ // Error gradient
+ ynew = (ynew - p) * Math.exp(logbeta + ialpha * Math.log(x) + ibeta * Math.log1p(-x));
+ if(ynew * y <= 0.) {
+ prevstep = Math.max(Math.abs(stepsize), 3e-308);
+ }
+ // Inner loop: try different step sizes: y * 3^-i
+ double g = 1, xnew = 0.;
+ for(int inner = 0; inner < 1000; inner++) {
+ stepsize = g * ynew;
+ if(Math.abs(stepsize) < prevstep) {
+ xnew = x - stepsize; // Candidate x
+ if(xnew >= 0. && xnew <= 1.) {
+ // Close enough
+ if(prevstep <= acu || Math.abs(ynew) <= acu) {
+ return x;
+ }
+ if(xnew != 0. && xnew != 1.) {
+ break;
+ }
+ }
+ }
+ g /= 3.;
+ }
+ // Convergence
+ if(Math.abs(xnew - x) < 1e-15 * x) {
+ return x;
+ }
+ // Iterate with new values
+ x = xnew;
+ y = ynew;
+ }
+ }
+ // Not converged in Newton-Raphson
+ throw new AbortException("Beta quantile computation did not converge.");
+ }
+}
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/math/statistics/distribution/ChiDistribution.java b/src/de/lmu/ifi/dbs/elki/math/statistics/distribution/ChiDistribution.java index c561c4cd..84c86e98 100644 --- a/src/de/lmu/ifi/dbs/elki/math/statistics/distribution/ChiDistribution.java +++ b/src/de/lmu/ifi/dbs/elki/math/statistics/distribution/ChiDistribution.java @@ -1,5 +1,7 @@ package de.lmu.ifi.dbs.elki.math.statistics.distribution; +import de.lmu.ifi.dbs.elki.utilities.exceptions.ExceptionMessages; + /* This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures @@ -27,8 +29,10 @@ package de.lmu.ifi.dbs.elki.math.statistics.distribution; * Chi distribution. * * @author Erich Schubert + * + * @apiviz.composedOf ChiSquaredDistribution */ -public class ChiDistribution implements Distribution { +public class ChiDistribution implements DistributionWithRandom { /** * Degrees of freedom. Usually integer. */ @@ -89,4 +93,15 @@ public class ChiDistribution implements Distribution { public static double cdf(double val, double dof) { return GammaDistribution.regularizedGammaP(dof / 2, val * val / 2); } + + // FIXME: implement! + @Override + public double quantile(double val) { + throw new UnsupportedOperationException(ExceptionMessages.UNSUPPORTED_NOT_YET); + } + + @Override + public String toString() { + return "ChiDistribution(dof=" + dof + ")"; + } }
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/math/statistics/distribution/ChiSquaredDistribution.java b/src/de/lmu/ifi/dbs/elki/math/statistics/distribution/ChiSquaredDistribution.java index 0ab39c78..8555afd3 100644 --- a/src/de/lmu/ifi/dbs/elki/math/statistics/distribution/ChiSquaredDistribution.java +++ b/src/de/lmu/ifi/dbs/elki/math/statistics/distribution/ChiSquaredDistribution.java @@ -1,5 +1,7 @@ package de.lmu.ifi.dbs.elki.math.statistics.distribution; +import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; + /* This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures @@ -35,7 +37,7 @@ public class ChiSquaredDistribution extends GammaDistribution { * @param dof Degrees of freedom. */ public ChiSquaredDistribution(double dof) { - super(.5 * dof, 2.0); + super(.5 * dof, .5); } /** @@ -69,4 +71,23 @@ public class ChiSquaredDistribution extends GammaDistribution { } return Math.exp((k - 1.0) * Math.log(x * 2.0) - x * 2.0 - logGamma(k)) * 2.0; } + + /** + * Return the quantile function for this distribution + * + * Reference: + * <p> + * Algorithm AS 91: The percentage points of the $\chi$^2 distribution<br /> + * D.J. Best, D. E. Roberts<br /> + * Journal of the Royal Statistical Society. Series C (Applied Statistics) + * </p> + * + * @param x Quantile + * @param dof Degrees of freedom + * @return quantile position + */ + @Reference(title = "Algorithm AS 91: The percentage points of the $\\chi^2$ distribution", authors = "D.J. Best, D. E. Roberts", booktitle = "Journal of the Royal Statistical Society. Series C (Applied Statistics)") + public static double quantile(double x, double dof) { + return GammaDistribution.quantile(x, .5 * dof, .5); + } }
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/math/statistics/distribution/ConstantDistribution.java b/src/de/lmu/ifi/dbs/elki/math/statistics/distribution/ConstantDistribution.java index 1f36dd4a..b046f0ef 100644 --- a/src/de/lmu/ifi/dbs/elki/math/statistics/distribution/ConstantDistribution.java +++ b/src/de/lmu/ifi/dbs/elki/math/statistics/distribution/ConstantDistribution.java @@ -28,7 +28,7 @@ package de.lmu.ifi.dbs.elki.math.statistics.distribution; * * @author Erich Schubert */ -public class ConstantDistribution implements Distribution { +public class ConstantDistribution implements DistributionWithRandom { /** * The constant */ @@ -58,4 +58,9 @@ public class ConstantDistribution implements Distribution { public double cdf(double val) { return (val >= c) ? 1.0 : 0.0; } + + @Override + public double quantile(double val) { + return c; + } }
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/math/statistics/distribution/Distribution.java b/src/de/lmu/ifi/dbs/elki/math/statistics/distribution/Distribution.java index 290e6434..5b6cd286 100644 --- a/src/de/lmu/ifi/dbs/elki/math/statistics/distribution/Distribution.java +++ b/src/de/lmu/ifi/dbs/elki/math/statistics/distribution/Distribution.java @@ -24,20 +24,14 @@ package de.lmu.ifi.dbs.elki.math.statistics.distribution; */ /** - * Interface for a simple distribution generator with a PDF, i.e. it can also - * compute a density + * Statistical distributions, with their common functions. See + * {@link DistributionWithRandom} for distributions that also have a random + * generator included. * * @author Erich Schubert */ public interface Distribution { /** - * Generate a new random value - * - * @return new random value - */ - public double nextRandom(); - - /** * Return the density of an existing value * * @param val existing value @@ -54,6 +48,14 @@ public interface Distribution { public double cdf(double val); /** + * Quantile aka probit (for normal) aka inverse CDF (invcdf, cdf^-1) function. + * + * @param val Quantile to find + * @return Quantile position + */ + public double quantile(double val); + + /** * Describe the distribution * * @return description diff --git a/src/de/lmu/ifi/dbs/elki/math/statistics/distribution/DistributionWithRandom.java b/src/de/lmu/ifi/dbs/elki/math/statistics/distribution/DistributionWithRandom.java new file mode 100644 index 00000000..af272528 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/math/statistics/distribution/DistributionWithRandom.java @@ -0,0 +1,37 @@ +package de.lmu.ifi.dbs.elki.math.statistics.distribution; +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2012 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +/** + * Distribution that also has support for generating random numbers. + * + * @author Erich Schubert + */ +public interface DistributionWithRandom extends Distribution { + /** + * Generate a new random value + * + * @return new random value + */ + public double nextRandom(); +} diff --git a/src/de/lmu/ifi/dbs/elki/math/statistics/distribution/GammaDistribution.java b/src/de/lmu/ifi/dbs/elki/math/statistics/distribution/GammaDistribution.java index 6830f25a..21eebc51 100644 --- a/src/de/lmu/ifi/dbs/elki/math/statistics/distribution/GammaDistribution.java +++ b/src/de/lmu/ifi/dbs/elki/math/statistics/distribution/GammaDistribution.java @@ -25,14 +25,21 @@ package de.lmu.ifi.dbs.elki.math.statistics.distribution; import java.util.Random; +import de.lmu.ifi.dbs.elki.logging.LoggingUtil; import de.lmu.ifi.dbs.elki.math.MathUtil; +import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; /** * Gamma Distribution, with random generation and density functions. * * @author Erich Schubert */ -public class GammaDistribution implements Distribution { +public class GammaDistribution implements DistributionWithRandom { + /** + * Euler–Mascheroni constant + */ + public static final double EULERS_CONST = 0.5772156649015328606065120900824024; + /** * LANCZOS-Coefficients for Gamma approximation. * @@ -102,6 +109,11 @@ public class GammaDistribution implements Distribution { } @Override + public double quantile(double val) { + return quantile(val, k, theta); + } + + @Override public double nextRandom() { return nextRandom(k, theta, random); } @@ -113,7 +125,7 @@ public class GammaDistribution implements Distribution { */ @Override public String toString() { - return "Gamma Distribution (k=" + k + ", theta=" + theta + ")"; + return "GammaDistribution(k=" + k + ", theta=" + theta + ")"; } /** @@ -139,7 +151,19 @@ public class GammaDistribution implements Distribution { * @return cdf value */ public static double cdf(double val, double k, double theta) { - return regularizedGammaP(k, val / theta); + return regularizedGammaP(k, val * theta); + } + + /** + * The log CDF, static version. + * + * @param val Value + * @param k Shape k + * @param theta Theta = 1.0/Beta aka. "scaling" parameter + * @return cdf value + */ + public static double logcdf(double val, double k, double theta) { + return logregularizedGammaP(k, val * theta); } /** @@ -170,6 +194,33 @@ public class GammaDistribution implements Distribution { } /** + * Gamma distribution PDF (with 0.0 for x < 0) + * + * @param x query value + * @param k Alpha + * @param theta Theta = 1 / Beta + * @return probability density + */ + public static double logpdf(double x, double k, double theta) { + if(x < 0) { + return Double.NEGATIVE_INFINITY; + } + if(x == 0) { + if(k == 1.0) { + return Math.log(theta); + } + else { + return Double.NEGATIVE_INFINITY; + } + } + if(k == 1.0) { + return Math.log(theta) - x * theta; + } + + return Math.log(theta) + (k - 1.0) * Math.log(x * theta) - x * theta - logGamma(k); + } + + /** * Compute logGamma. * * Based loosely on "Numerical Recpies" and the work of Paul Godfrey at @@ -195,6 +246,22 @@ public class GammaDistribution implements Distribution { } /** + * Compute the regular Gamma function. + * + * Note: for numerical reasons, it is preferrable to use {@link #logGamma} + * when possible! In particular, this method just computes + * {@code Math.exp(logGamma(x))} anyway. + * + * Try to postpone the {@code Math.exp} call to preserve numeric range! + * + * @param x Position + * @return Gamma at this position + */ + public static double gamma(double x) { + return Math.exp(logGamma(x)); + } + + /** * Returns the regularized gamma function P(a, x). * * Includes the quadrature way of computing. @@ -236,6 +303,49 @@ public class GammaDistribution implements Distribution { } /** + * Returns the regularized gamma function log P(a, x). + * + * Includes the quadrature way of computing. + * + * TODO: find "the" most accurate version of this. We seem to agree with + * others for the first 10+ digits, but diverge a bit later than that. + * + * @param a Parameter a + * @param x Parameter x + * @return Gamma value + */ + public static double logregularizedGammaP(final double a, final double x) { + // Special cases + if(Double.isNaN(a) || Double.isNaN(x) || (a <= 0.0) || (x < 0.0)) { + return Double.NaN; + } + if(x == 0.0) { + return Double.NEGATIVE_INFINITY; + } + if(x >= a + 1) { + // Expected to converge faster + // FIXME: and in log? + return Math.log(1.0 - regularizedGammaQ(a, x)); + } + // Loosely following "Numerical Recipes" + double del = 1.0 / a; + double sum = del; + for(int n = 1; n < Integer.MAX_VALUE; n++) { + // compute next element in the series + del *= x / (a + n); + sum = sum + del; + if(Math.abs(del / sum) < NUM_PRECISION || sum >= Double.POSITIVE_INFINITY) { + break; + } + } + if(Double.isInfinite(sum)) { + return 0; + } + // TODO: reread numerical recipes, can we replace log(sum)? + return -x + (a * Math.log(x)) - logGamma(a) + Math.log(sum); + } + + /** * Returns the regularized gamma function Q(a, x) = 1 - P(a, x). * * Includes the continued fraction way of computing, based loosely on the book @@ -313,7 +423,7 @@ public class GammaDistribution implements Distribution { final double e1 = 1.000000000, e2 = 0.499999994, e3 = 0.166666848; final double e4 = 0.041664508, e5 = 0.008345522, e6 = 0.001353826; final double e7 = 0.000247453; - + if(k < 1.0) { // Base case, for small k final double b = 1.0 + 0.36788794412 * k; // Step 1 while(true) { @@ -360,11 +470,11 @@ public class GammaDistribution implements Distribution { /* v2 = tv2; */ v12 = tv12; } - + // double b = 0.0, c = 0.0; // double si = 0.0, q0 = 0.0; final double b, c, si, q0; - + // Simpler accept cases & parameter computation { final double t = v1 * Math.sqrt(-2.0 * Math.log(v12) / v12); @@ -373,14 +483,14 @@ public class GammaDistribution implements Distribution { if(t >= 0.0) { return (gds / theta); // Immediate acceptance } - + // Random uniform final double un = random.nextDouble(); // Squeeze acceptance if(d * un <= t * t * t) { return (gds / theta); } - + if(k != -1.0) { // Step 4. Set-up for hat case final double r = 1.0 / k; q0 = ((((((((q9 * r + q8) * r + q7) * r + q6) * r + q5) * r + q4) * r + q3) * r + q2) * r + q1) * r; @@ -425,7 +535,7 @@ public class GammaDistribution implements Distribution { } } } - + // Double exponential deviate t while(true) { double e, u, sign_u, t; @@ -438,7 +548,7 @@ public class GammaDistribution implements Distribution { t = b + (e * si) * sign_u; } while(t <= -0.71874483771719); - + // New v(t) and q(t) final double v = t / (s + s); final double q; @@ -467,4 +577,370 @@ public class GammaDistribution implements Distribution { } } } + + /** + * Approximate probit for chi squared distribution + * + * Based on first half of algorithm AS 91 + * + * Reference: + * <p> + * Algorithm AS 91: The percentage points of the $\chi$ 2 distribution<br /> + * D.J. Best, D. E. Roberts<br /> + * Journal of the Royal Statistical Society. Series C (Applied Statistics) + * </p> + * + * @param p Probit value + * @param nu Shape parameter for Chi, nu = 2 * k + * @param g log(nu) + * @return Probit for chi squared + */ + @Reference(title = "Algorithm AS 91: The percentage points of the $\\chi^2$ distribution", authors = "D.J. Best, D. E. Roberts", booktitle = "Journal of the Royal Statistical Society. Series C (Applied Statistics)") + protected static double chisquaredProbitApproximation(final double p, double nu, double g) { + final double EPS1 = 1e-2; // Approximation quality + // Sanity checks + if(Double.isNaN(p) || Double.isNaN(nu)) { + return Double.NaN; + } + // Range check + if(p <= 0) { + return 0; + } + if(p >= 1) { + return Double.POSITIVE_INFINITY; + } + // Invalid parameters + if(nu <= 0) { + return Double.NaN; + } + // Shape of gamma distribution, "XX" in AS 91 + final double k = 0.5 * nu; + + // For small chi squared values - AS 91 + final double logp = Math.log(p); + if(nu < -1.24 * logp) { + // FIXME: implement and use logGammap1 instead - more stable? + // + // final double lgam1pa = (alpha < 0.5) ? logGammap1(alpha) : + // (Math.log(alpha) + g); + // return Math.exp((lgam1pa + logp) / alpha + MathUtil.LOG2); + // This is literal AS 91, above is the GNU R variant. + return Math.pow(p * k * Math.exp(g + k * MathUtil.LOG2), 1 / k); + } + else if(nu > 0.32) { + // Wilson and Hilferty estimate: - AS 91 at 3 + final double x = NormalDistribution.quantile(p, 0, 1); + final double p1 = 2. / (9. * nu); + double ch = nu * Math.pow(x * Math.sqrt(p1) + 1 - p1, 3); + + // Better approximation for p tending to 1: + if(ch > 2.2 * nu + 6) { + ch = -2 * (Math.log1p(-p) - (k - 1) * Math.log(0.5 * ch) + g); + } + return ch; + } + else { + // nu <= 0.32, AS 91 at 1 + final double C7 = 4.67, C8 = 6.66, C9 = 6.73, C10 = 13.32; + final double ag = Math.log1p(-p) + g + (k - 1) * MathUtil.LOG2; + double ch = 0.4; + while(true) { + final double p1 = 1 + ch * (C7 + ch); + final double p2 = ch * (C9 + ch * (C8 + ch)); + final double t = -0.5 + (C7 + 2 * ch) / p1 - (C9 + ch * (C10 + 3 * ch)) / p2; + final double delta = (1 - Math.exp(ag + 0.5 * ch) * p2 / p1) / t; + ch -= delta; + if(Math.abs(delta) > EPS1 * Math.abs(ch)) { + return ch; + } + } + } + } + + /** + * Compute probit (inverse cdf) for Gamma distributions. + * + * Based on algorithm AS 91: + * + * Reference: + * <p> + * Algorithm AS 91: The percentage points of the $\chi$^2 distribution<br /> + * D.J. Best, D. E. Roberts<br /> + * Journal of the Royal Statistical Society. Series C (Applied Statistics) + * </p> + * + * @param p Probability + * @param k k, alpha aka. "shape" parameter + * @param theta Theta = 1.0/Beta aka. "scaling" parameter + * @return Probit for Gamma distribution + */ + @Reference(title = "Algorithm AS 91: The percentage points of the $\\chi$^2 distribution", authors = "D.J. Best, D. E. Roberts", booktitle = "Journal of the Royal Statistical Society. Series C (Applied Statistics)") + public static double quantile(double p, double k, double theta) { + final double EPS2 = 5e-7; // final precision of AS 91 + final int MAXIT = 1000; + + // Avoid degenerates + if(Double.isNaN(p) || Double.isNaN(k) || Double.isNaN(theta)) { + return Double.NaN; + } + // Range check + if(p <= 0) { + return 0; + } + if(p >= 1) { + return Double.POSITIVE_INFINITY; + } + // Shape parameter check + if(k < 0 || theta <= 0) { + return Double.NaN; + } + // Corner case - all at 0 + if(k == 0) { + return 0.; + } + + int max_newton_iterations = 1; + // For small values, ensure some refinement iterations + if(k < 1e-10) { + max_newton_iterations = 7; + } + + final double g = logGamma(k); // == logGamma(v/2) + + // Phase I, an initial rough approximation + // First half of AS 91 + double ch = chisquaredProbitApproximation(p, 2 * k, g); + // Second hald of AS 91 follows: + // Refine ChiSquared approximation + chisq: { + if(Double.isInfinite(ch)) { + // Cannot refine infinity + max_newton_iterations = 0; + break chisq; + } + if(ch < EPS2) { + // Do not iterate, but refine with newton method + max_newton_iterations = 20; + break chisq; + } + if(p > 1 - 1e-14 || p < 1e-100) { + // Not in appropriate value range for AS 91 + max_newton_iterations = 20; + break chisq; + } + + // Phase II: Iteration + final double c = k - 1; + final double ch0 = ch; // backup initial approximation + for(int i = 1; i <= MAXIT; i++) { + final double q = ch; // previous approximation + final double p1 = 0.5 * ch; + final double p2 = p - regularizedGammaP(k, p1); + if(Double.isInfinite(p2) || ch <= 0) { + ch = ch0; + max_newton_iterations = 27; + break chisq; + } + { // Taylor series of AS 91: iteration via "goto 4" + final double t = p2 * Math.exp(k * MathUtil.LOG2 + g + p1 - c * Math.log(ch)); + final double b = t / ch; + final double a = 0.5 * t - b * c; + final double s1 = (210. + a * (140. + a * (105. + a * (84. + a * (70. + 60. * a))))) / 420.; + final double s2 = (420. + a * (735. + a * (966. + a * (1141. + 1278 * a)))) / 2520.; + final double s3 = (210. + a * (462. + a * (707. + 932. * a))) / 2520.; + final double s4 = (252. + a * (672. + 1182. * a) + c * (294. + a * (889. + 1740. * a))) / 5040.; + final double s5 = (84. + 2264. * a + c * (1175. + 606. * a)) / 2520.; + final double s6 = (120. + c * (346. + 127. * c)) / 5040.; + ch += t * (1 + 0.5 * t * s1 - b * c * (s1 - b * (s2 - b * (s3 - b * (s4 - b * (s5 - b * s6)))))); + } + if(Math.abs(q - ch) < EPS2 * ch) { + break chisq; + } + // Divergence treatment, from GNU R + if(Math.abs(q - ch) > 0.1 * Math.abs(ch)) { + ch = ((ch < q) ? 0.9 : 1.1) * q; + } + } + LoggingUtil.warning("No convergence in AS 91 Gamma probit."); + // no convergence in MAXIT iterations -- but we add Newton now... + } + double x = 0.5 * ch / theta; + if(max_newton_iterations > 0) { + // Refine result using final Newton steps. + // TODO: add unit tests that show an improvement! Maybe in logscale only? + x = gammaQuantileNewtonRefinement(Math.log(p), k, theta, max_newton_iterations, x); + } + return x; + } + + /** + * Refinement of ChiSquared probit using Newton iterations. + * + * A trick used by GNU R to improve precision. + * + * @param logpt Target value of log p + * @param k Alpha + * @param theta Theta = 1 / Beta + * @param maxit Maximum number of iterations to do + * @param x Initial estimate + * @return Refined value + */ + protected static double gammaQuantileNewtonRefinement(final double logpt, final double k, final double theta, final int maxit, double x) { + final double EPS_N = 1e-15; // Precision threshold + // 0 is not possible, try MIN_NORMAL instead + if(x <= 0) { + x = Double.MIN_NORMAL; + } + // Current estimation + double logpc = logcdf(x, k, theta); + if(x == Double.MIN_NORMAL && logpc > logpt * (1. + 1e-7)) { + return 0.; + } + if(logpc == Double.NEGATIVE_INFINITY) { + return 0.; + } + // Refine by newton iterations + for(int i = 0; i < maxit; i++) { + // Error of current approximation + final double logpe = logpc - logpt; + if(Math.abs(logpe) < Math.abs(EPS_N * logpt)) { + break; + } + // Step size is controlled by PDF: + final double g = logpdf(x, k, theta); + if(g == Double.NEGATIVE_INFINITY) { + break; + } + final double newx = x - logpe * Math.exp(logpc - g); + // New estimate: + logpc = logcdf(newx, k, theta); + if(Math.abs(logpc - logpt) > Math.abs(logpe) || (i > 0 && Math.abs(logpc - logpt) == Math.abs(logpe))) { + // no further improvement + break; + } + x = newx; + } + return x; + } + + /** + * Compute the Psi / Digamma function + * + * Reference: + * <p> + * J. M. Bernando<br /> + * Algorithm AS 103: Psi (Digamma) Function<br /> + * Statistical Algorithms + * </p> + * + * TODO: is there a more accurate version maybe in R? + * + * @param x Position + * @return digamma value + */ + @Reference(authors = "J. M. Bernando", title = "Algorithm AS 103: Psi (Digamma) Function", booktitle = "Statistical Algorithms") + public static double digamma(double x) { + if(!(x > 0)) { + return Double.NaN; + } + // Method of equation 5: + if(x <= 1e-5) { + return -EULERS_CONST - 1. / x; + } + // Method of equation 4: + else if(x > 49.) { + final double ix2 = 1. / (x * x); + // Partial series expansion + return Math.log(x) - 0.5 / x - ix2 * ((1.0 / 12.) + ix2 * (1.0 / 120. - ix2 / 252.)); + // + O(x^8) error + } + else { + // Stirling expansion + return digamma(x + 1.) - 1. / x; + } + } + + /** + * Compute the Trigamma function. Based on digamma. + * + * TODO: is there a more accurate version maybe in R? + * + * @param x Position + * @return trigamma value + */ + public static double trigamma(double x) { + if(!(x > 0)) { + return Double.NaN; + } + // Method of equation 5: + if(x <= 1e-5) { + return 1. / (x * x); + } + // Method of equation 4: + else if(x > 49.) { + final double ix2 = 1. / (x * x); + // Partial series expansion + return 1 / x - ix2 / 2. + ix2 / x * (1.0 / 6. - ix2 * (1.0 / 30. + ix2 / 42.)); + // + O(x^8) error + } + else { + // Stirling expansion + return trigamma(x + 1.) - 1. / (x * x); + } + } + + /** + * Mean least squares estimation of Gamma distribution to a set of + * observations. + * + * @param data Data + * @return Assumed distribution + */ + public static GammaDistribution estimate(double[] data) { + return estimate(data, data.length); + } + + /** + * Mean least squares estimation of Gamma distribution to a set of + * observations. + * + * Reference: + * <p> + * Maximum likelihood estimation of the parameters of the gamma distribution + * and their bias<br /> + * S. C. Choi, R. Wette<br /> + * in: Technometrics + * </p> + * + * @param data Data + * @param len Length of array + * @return Assumed distribution + */ + @Reference(title = "Maximum likelihood estimation of the parameters of the gamma distribution and their bias", authors = "S. C. Choi, R. Wette", booktitle = "Technometrics", url = "http://www.jstor.org/stable/10.2307/1266892") + public static GammaDistribution estimate(double[] data, int len) { + double meanx = 0, meanlogx = 0; + for(int i = 0; i < len; i++) { + final double logx = Math.log(data[i]); + final double deltax = data[i] - meanx; + final double deltalogx = logx - meanlogx; + meanx += deltax / (i + 1.); + meanlogx += deltalogx / (i + 1.); + } + // Initial approximation + final double logmeanx = Math.log(meanx); + final double diff = logmeanx - meanlogx; + double k = (3 - diff + Math.sqrt((diff - 3) * (diff - 3) + 24 * diff)) / (12 * diff); + + // Refine via newton iteration, based on Choi and Wette equation + while(true) { + double kdelta = (Math.log(k) - digamma(k) - diff) / (1 / k - trigamma(k)); + if(Math.abs(kdelta) < 1E-8 || Double.isNaN(kdelta)) { + break; + } + k += kdelta; + } + // Estimate theta: + final double theta = k / meanx; + return new GammaDistribution(k, theta); + } }
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/math/statistics/distribution/NormalDistribution.java b/src/de/lmu/ifi/dbs/elki/math/statistics/distribution/NormalDistribution.java index 919cc2e3..9180b59e 100644 --- a/src/de/lmu/ifi/dbs/elki/math/statistics/distribution/NormalDistribution.java +++ b/src/de/lmu/ifi/dbs/elki/math/statistics/distribution/NormalDistribution.java @@ -32,7 +32,7 @@ import de.lmu.ifi.dbs.elki.math.MathUtil; * * @author Erich Schubert */ -public class NormalDistribution implements Distribution { +public class NormalDistribution implements DistributionWithRandom { /** * Coefficients for erf approximation. * @@ -148,20 +148,25 @@ public class NormalDistribution implements Distribution { public double pdf(double val) { return pdf(val, mean, stddev); } - + @Override public double cdf(double val) { return cdf(val, mean, stddev); } @Override + public double quantile(double q) { + return quantile(q, mean, stddev); + } + + @Override public double nextRandom() { return mean + random.nextGaussian() * stddev; } @Override public String toString() { - return "Normal Distribution (mean="+mean+", stddev="+stddev+")"; + return "NormalDistribution(mean=" + mean + ", stddev=" + stddev + ")"; } /** @@ -195,7 +200,7 @@ public class NormalDistribution implements Distribution { if(Double.isInfinite(x)) { return (x < 0.0) ? 2 : 0; } - + double result = Double.NaN; double absx = Math.abs(x); // First approximation interval @@ -249,7 +254,7 @@ public class NormalDistribution implements Distribution { * @return erfinv(x) */ public static double erfinv(double x) { - return standardNormalProbit(0.5 * (x + 1)) / MathUtil.SQRT2; + return standardNormalQuantile(0.5 * (x + 1)) / MathUtil.SQRT2; } /** @@ -261,10 +266,13 @@ public class NormalDistribution implements Distribution { * by Peter John Acklam * </p> * + * FIXME: precision of this seems to be rather low, compared to our other + * functions. Only about 8-9 digits agree with SciPy/GNU R. + * * @param d Quantile. Must be in [0:1], obviously. * @return Inverse erf. */ - public static double standardNormalProbit(double d) { + public static double standardNormalQuantile(double d) { if(d == 0) { return Double.NEGATIVE_INFINITY; } @@ -319,7 +327,7 @@ public class NormalDistribution implements Distribution { * @return The CDF of the normal given distribution at x. */ public static double cdf(double x, double mu, double sigma) { - return (1 + erf(x / Math.sqrt(2))) / 2; + return .5 * (1 + erf((x - mu) / (MathUtil.SQRT2 * sigma))); } /** @@ -331,7 +339,7 @@ public class NormalDistribution implements Distribution { * @param sigma Standard deviation. * @return The probit of the normal given distribution at x. */ - public static double probit(double x, double mu, double sigma) { - return mu + sigma * standardNormalProbit(x); + public static double quantile(double x, double mu, double sigma) { + return mu + sigma * standardNormalQuantile(x); } } diff --git a/src/de/lmu/ifi/dbs/elki/math/statistics/distribution/PoissonDistribution.java b/src/de/lmu/ifi/dbs/elki/math/statistics/distribution/PoissonDistribution.java new file mode 100644 index 00000000..53fb0dc8 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/math/statistics/distribution/PoissonDistribution.java @@ -0,0 +1,411 @@ +package de.lmu.ifi.dbs.elki.math.statistics.distribution; + +import de.lmu.ifi.dbs.elki.math.MathUtil; +import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; +import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException; +import de.lmu.ifi.dbs.elki.utilities.exceptions.ExceptionMessages; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2012 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +/** + * INCOMPLETE implementation of the poisson distribution. + * + * TODO: continue implementing, CDF, invcdf and nextRandom are missing + * + * References: + * <p> + * Catherine Loader<br /> + * Fast and Accurate Computation of Binomial Probabilities. + * </p> + * + * @author Erich Schubert + */ +public class PoissonDistribution implements Distribution { + /** + * Number of tries + */ + private int n; + + /** + * Success probability + */ + private double p; + + /** Stirling error constants: 1./12 */ + private final static double S0 = 0.08333333333333333333333d; + + /** Stirling error constants: 1./360 */ + private final static double S1 = 0.0027777777777777777777777777778d; + + /** Stirling error constants: 1./1260 */ + private final static double S2 = 0.00079365079365079365079365d; + + /** Stirling error constants: 1./1680 */ + private final static double S3 = 0.000595238095238095238095238095d; + + /** Stirling error constants: 1./1188 */ + private final static double S4 = 0.00084175084175084175084175084175d; + + /** + * Exact table values for n <= 15 in steps of 0.5 + * + * sfe[n] = ln( (n!*e^n)/((n^n)*sqrt(2*pi*n)) ) + */ + private final static double STIRLING_EXACT_ERROR[] = {// + 0.0, // 0.0 + 0.1534264097200273452913848, // 0.5 + 0.0810614667953272582196702, // 1.0 + 0.0548141210519176538961390, // 1.5 + 0.0413406959554092940938221, // 2.0 + 0.03316287351993628748511048, // 2.5 + 0.02767792568499833914878929, // 3.0 + 0.02374616365629749597132920, // 3.5 + 0.02079067210376509311152277, // 4.0 + 0.01848845053267318523077934, // 4.5 + 0.01664469118982119216319487, // 5.0 + 0.01513497322191737887351255, // 5.5 + 0.01387612882307074799874573, // 6.0 + 0.01281046524292022692424986, // 6.5 + 0.01189670994589177009505572, // 7.0 + 0.01110455975820691732662991, // 7.5 + 0.010411265261972096497478567, // 8.0 + 0.009799416126158803298389475, // 8.5 + 0.009255462182712732917728637, // 9.0 + 0.008768700134139385462952823, // 9.5 + 0.008330563433362871256469318, // 10.0 + 0.007934114564314020547248100, // 10.5 + 0.007573675487951840794972024, // 11.0 + 0.007244554301320383179543912, // 11.5 + 0.006942840107209529865664152, // 12.0 + 0.006665247032707682442354394, // 12.5 + 0.006408994188004207068439631, // 13.0 + 0.006171712263039457647532867, // 13.5 + 0.005951370112758847735624416, // 14.0 + 0.005746216513010115682023589, // 14.5 + 0.0055547335519628013710386900 // 15.0 + }; + + /** + * Constructor. + * + * Private: API not yet completely implemented! + * + * @param n Number of tries + * @param p Success probability + */ + public PoissonDistribution(int n, double p) { + super(); + this.n = n; + this.p = p; + } + + /** + * Poisson PMF for integer values. + * + * @param x integer values + * @return Probability + */ + @Reference(title = "Fast and accurate computation of binomial probabilities", authors = "C. Loader", booktitle = "", url = "http://projects.scipy.org/scipy/raw-attachment/ticket/620/loader2000Fast.pdf") + public double pmf(int x) { + // Invalid values + if(x < 0 || x > n) { + return 0.0; + } + // Extreme probabilities + if(p <= 0d) { + return x == 0 ? 1.0 : 0.0; + } + if(p >= 1d) { + return x == n ? 1.0 : 0.0; + } + // Extreme values of x + if(x == 0) { + if(p < 0.1) { + return Math.exp(-devianceTerm(n, n * (1.0 - p)) - n * p); + } + else { + return Math.exp(n * Math.log(1.0 - p)); + } + } + if(x == n) { + if(p > 0.9) { + return Math.exp(-devianceTerm(n, n * p) - n * (1 - p)); + } + else { + return Math.exp(n * Math.log(p)); + } + } + + final double lc = stirlingError(n) - stirlingError(x) - stirlingError(n - x) - devianceTerm(x, n * p) - devianceTerm(n - x, n * (1.0 - p)); + final double f = (MathUtil.TWOPI * x * (n - x)) / n; + return Math.exp(lc) / Math.sqrt(f); + } + + @Override + @Reference(title = "Fast and accurate computation of binomial probabilities", authors = "C. Loader", booktitle = "", url = "http://projects.scipy.org/scipy/raw-attachment/ticket/620/loader2000Fast.pdf") + public double pdf(double x) { + // Invalid values + if(x < 0 || x > n) { + return 0.0; + } + // Extreme probabilities + if(p <= 0d) { + return x == 0 ? 1.0 : 0.0; + } + if(p >= 1d) { + return x == n ? 1.0 : 0.0; + } + final double q = 1 - p; + // FIXME: check for x to be integer, return 0 otherwise? + + // Extreme values of x + if(x == 0) { + if(p < 0.1) { + return Math.exp(-devianceTerm(n, n * q) - n * p); + } + else { + return Math.exp(n * Math.log(q)); + } + } + if(x == n) { + if(p > 0.9) { + return Math.exp(-devianceTerm(n, n * p) - n * q); + } + else { + return Math.exp(n * Math.log(p)); + } + } + final double lc = stirlingError(n) - stirlingError(x) - stirlingError(n - x) - devianceTerm(x, n * p) - devianceTerm(n - x, n * q); + final double f = (MathUtil.TWOPI * x * (n - x)) / n; + return Math.exp(lc) / Math.sqrt(f); + } + + // FIXME: implement! + @Override + public double cdf(double val) { + throw new AbortException(ExceptionMessages.UNSUPPORTED_NOT_YET); + } + + // FIXME: implement! + @Override + public double quantile(double val) { + throw new AbortException(ExceptionMessages.UNSUPPORTED_NOT_YET); + } + + /** + * Compute the poisson distribution PDF with an offset of + 1 + * + * pdf(x_plus_1 - 1, lambda) + * + * @param x_plus_1 x+1 + * @param lambda Lambda + * @return pdf + */ + public static double poissonPDFm1(double x_plus_1, double lambda) { + if(Double.isInfinite(lambda)) { + return 0.; + } + if(x_plus_1 > 1) { + return rawProbability(x_plus_1 - 1, lambda); + } + if(lambda > Math.abs(x_plus_1 - 1) * MathUtil.LOG2 * Double.MAX_EXPONENT / 1e-14) { + return Math.exp(-lambda - GammaDistribution.logGamma(x_plus_1)); + } + else { + return rawProbability(x_plus_1, lambda) * (x_plus_1 / lambda); + } + } + + /** + * Compute the poisson distribution PDF with an offset of + 1 + * + * log pdf(x_plus_1 - 1, lambda) + * + * @param x_plus_1 x+1 + * @param lambda Lambda + * @return pdf + */ + public static double logpoissonPDFm1(double x_plus_1, double lambda) { + if(Double.isInfinite(lambda)) { + return Double.NEGATIVE_INFINITY; + } + if(x_plus_1 > 1) { + return rawLogProbability(x_plus_1 - 1, lambda); + } + if(lambda > Math.abs(x_plus_1 - 1) * MathUtil.LOG2 * Double.MAX_EXPONENT / 1e-14) { + return -lambda - GammaDistribution.logGamma(x_plus_1); + } + else { + return rawLogProbability(x_plus_1, lambda) + Math.log(x_plus_1 / lambda); + } + } + + /** + * Calculates the Striling Error + * + * stirlerr(n) = ln(n!) - ln(sqrt(2*pi*n)*(n/e)^n) + * + * @param n Parameter n + * @return Stirling error + */ + @Reference(title = "Fast and accurate computation of binomial probabilities", authors = "C. Loader", booktitle = "", url = "http://projects.scipy.org/scipy/raw-attachment/ticket/620/loader2000Fast.pdf") + private static double stirlingError(int n) { + // Try to use a table value: + if(n < 16) { + return STIRLING_EXACT_ERROR[n * 2]; + } + final double nn = n * n; + // Use the appropriate number of terms + if(n > 500) { + return (S0 - S1 / nn) / n; + } + if(n > 80) { + return ((S0 - (S1 - S2 / nn)) / nn) / n; + } + if(n > 35) { + return ((S0 - (S1 - (S2 - S3 / nn) / nn) / nn) / n); + } + return ((S0 - (S1 - (S2 - (S3 - S4 / nn) / nn) / nn) / nn) / n); + } + + /** + * Calculates the Striling Error + * + * stirlerr(n) = ln(n!) - ln(sqrt(2*pi*n)*(n/e)^n) + * + * @param n Parameter n + * @return Stirling error + */ + @Reference(title = "Fast and accurate computation of binomial probabilities", authors = "C. Loader", booktitle = "", url = "http://projects.scipy.org/scipy/raw-attachment/ticket/620/loader2000Fast.pdf") + private static double stirlingError(double n) { + if(n < 16.0) { + // Our table has a step size of 0.5 + final double n2 = 2.0 * n; + if(Math.floor(n2) == n2) { // Exact match + return STIRLING_EXACT_ERROR[(int) n2]; + } + else { + return GammaDistribution.logGamma(n + 1.0) - (n + 0.5) * Math.log(n) + n - MathUtil.LOGSQRTTWOPI; + } + } + final double nn = n * n; + if(n > 500.0) { + return (S0 - S1 / nn) / n; + } + if(n > 80.0) { + return ((S0 - (S1 - S2 / nn)) / nn) / n; + } + if(n > 35.0) { + return ((S0 - (S1 - (S2 - S3 / nn) / nn) / nn) / n); + } + return ((S0 - (S1 - (S2 - (S3 - S4 / nn) / nn) / nn) / nn) / n); + } + + /** + * Evaluate the deviance term of the saddle point approximation. + * + * bd0(x,np) = x*ln(x/np)+np-x + * + * @param x probability density function position + * @param np product of trials and success probability: n*p + * @return Deviance term + */ + @Reference(title = "Fast and accurate computation of binomial probabilities", authors = "C. Loader", booktitle = "", url = "http://projects.scipy.org/scipy/raw-attachment/ticket/620/loader2000Fast.pdf") + private static double devianceTerm(double x, double np) { + if(Math.abs(x - np) < 0.1 * (x + np)) { + final double v = (x - np) / (x + np); + + double s = (x - np) * v; + double ej = 2.0d * x * v; + for(int j = 1;; j++) { + ej *= v * v; + final double s1 = s + ej / (2 * j + 1); + if(s1 == s) { + return s1; + } + s = s1; + } + } + return x * Math.log(x / np) + np - x; + } + + /** + * Poisson distribution probability, but also for non-integer arguments. + * + * lb^x exp(-lb) / x! + * + * @param x X + * @param lambda lambda + * @return Poisson distribution probability + */ + public static double rawProbability(double x, double lambda) { + // Extreme lambda + if(lambda == 0) { + return ((x == 0) ? 1. : 0.); + } + // Extreme values + if(Double.isInfinite(lambda) || x < 0) { + return 0.; + } + if(x <= lambda * Double.MIN_NORMAL) { + return Math.exp(-lambda); + } + if(lambda < x * Double.MIN_NORMAL) { + double r = -lambda + x * Math.log(lambda) - GammaDistribution.logGamma(x + 1); + return Math.exp(r); + } + final double f = MathUtil.TWOPI * x; + final double y = -stirlingError(x) - devianceTerm(x, lambda); + return Math.exp(y) / Math.sqrt(f); + } + + /** + * Poisson distribution probability, but also for non-integer arguments. + * + * lb^x exp(-lb) / x! + * + * @param x X + * @param lambda lambda + * @return Poisson distribution probability + */ + public static double rawLogProbability(double x, double lambda) { + // Extreme lambda + if(lambda == 0) { + return ((x == 0) ? 1. : Double.NEGATIVE_INFINITY); + } + // Extreme values + if(Double.isInfinite(lambda) || x < 0) { + return Double.NEGATIVE_INFINITY; + } + if(x <= lambda * Double.MIN_NORMAL) { + return -lambda; + } + if(lambda < x * Double.MIN_NORMAL) { + return -lambda + x * Math.log(lambda) - GammaDistribution.logGamma(x + 1); + } + final double f = MathUtil.TWOPI * x; + final double y = -stirlingError(x) - devianceTerm(x, lambda); + return -0.5 * Math.log(f) + y; + } +}
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/math/statistics/distribution/StudentsTDistribution.java b/src/de/lmu/ifi/dbs/elki/math/statistics/distribution/StudentsTDistribution.java new file mode 100644 index 00000000..2e9e0d15 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/math/statistics/distribution/StudentsTDistribution.java @@ -0,0 +1,90 @@ +package de.lmu.ifi.dbs.elki.math.statistics.distribution;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2012
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException;
+import de.lmu.ifi.dbs.elki.utilities.exceptions.ExceptionMessages;
+
+/**
+ * Student's t distribution.
+ *
+ * FIXME: add quantile function!
+ *
+ * @author Jan Brusis
+ */
+public class StudentsTDistribution implements Distribution {
+ /**
+ * Degrees of freedom
+ */
+ private final int v;
+
+ /**
+ * Constructor.
+ *
+ * @param v Degrees of freedom
+ */
+ public StudentsTDistribution(int v) {
+ this.v = v;
+ }
+
+ @Override
+ public double pdf(double val) {
+ return pdf(val, v);
+ }
+
+ @Override
+ public double cdf(double val) {
+ return cdf(val, v);
+ }
+
+ // FIXME: implement!
+ @Override
+ public double quantile(double val) {
+ throw new AbortException(ExceptionMessages.UNSUPPORTED_NOT_YET);
+ }
+
+ /**
+ * Static version of the t distribution's PDF.
+ *
+ * @param val value to evaluate
+ * @param v degrees of freedom
+ * @return f(val,v)
+ */
+ public static double pdf(double val, int v) {
+ // TODO: improve precision by computing "exp" last?
+ return Math.exp(GammaDistribution.logGamma((v + 1) / 2) - GammaDistribution.logGamma(v / 2)) * (1 / Math.sqrt(v * Math.PI)) * Math.pow(1 + (val * val) / v, -((v + 1) / 2));
+ }
+
+ /**
+ * Static version of the CDF of the t-distribution for t > 0
+ *
+ * @param val value to evaluate
+ * @param v degrees of freedom
+ * @return F(val, v)
+ */
+ public static double cdf(double val, int v) {
+ double x = v / (val * val + v);
+ return 1 - (0.5 * BetaDistribution.regularizedIncBeta(x, v / 2, 0.5));
+ }
+}
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/math/statistics/distribution/UniformDistribution.java b/src/de/lmu/ifi/dbs/elki/math/statistics/distribution/UniformDistribution.java index 9571cfd3..4f54fbf9 100644 --- a/src/de/lmu/ifi/dbs/elki/math/statistics/distribution/UniformDistribution.java +++ b/src/de/lmu/ifi/dbs/elki/math/statistics/distribution/UniformDistribution.java @@ -30,7 +30,7 @@ import java.util.Random; * * @author Erich Schubert */ -public class UniformDistribution implements Distribution { +public class UniformDistribution implements DistributionWithRandom { /** * Minimum */ @@ -100,20 +100,20 @@ public class UniformDistribution implements Distribution { } return (val - min) / len; } + + @Override + public double quantile(double val) { + return min + len * val; + } @Override public double nextRandom() { return min + random.nextDouble() * len; } - /** - * Simple toString explaining the distribution parameters. - * - * Used in describing cluster models. - */ @Override public String toString() { - return "Uniform Distribution (min=" + min + ", max=" + max + ")"; + return "UniformDistribution(min=" + min + ", max=" + max + ")"; } /** diff --git a/src/de/lmu/ifi/dbs/elki/math/statistics/tests/GoodnessOfFitTest.java b/src/de/lmu/ifi/dbs/elki/math/statistics/tests/GoodnessOfFitTest.java new file mode 100644 index 00000000..f50f469f --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/math/statistics/tests/GoodnessOfFitTest.java @@ -0,0 +1,49 @@ +package de.lmu.ifi.dbs.elki.math.statistics.tests;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2012
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.Parameterizable;
+
+/**
+ * Interface for the statistical test used by HiCS.
+ *
+ * Provides a single method that calculates the deviation between two data
+ * samples, given as arrays of double values
+ *
+ * @author Jan Brusis
+ * @author Erich Schubert
+ */
+public interface GoodnessOfFitTest extends Parameterizable {
+ /**
+ * Measure the deviation of a full sample from a conditional sample.
+ *
+ * Sample arrays *may* be modified, e.g. sorted, by the test.
+ *
+ * @param fullSample Full sample
+ * @param conditionalSample Conditional sample
+ *
+ * @return Deviation
+ */
+ public double deviation(double[] fullSample, double[] conditionalSample);
+}
diff --git a/src/de/lmu/ifi/dbs/elki/math/statistics/tests/KolmogorovSmirnovTest.java b/src/de/lmu/ifi/dbs/elki/math/statistics/tests/KolmogorovSmirnovTest.java new file mode 100644 index 00000000..236f26b8 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/math/statistics/tests/KolmogorovSmirnovTest.java @@ -0,0 +1,117 @@ +package de.lmu.ifi.dbs.elki.math.statistics.tests;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2012
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import java.util.Arrays;
+
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
+
+/**
+ * Kolmogorov-Smirnov test.
+ *
+ * Class that tests two given real-valued data samples on whether they might
+ * have originated from the same underlying distribution using the
+ * Kolmogorov-Smirnov test statistic that compares the two empirical cumulative
+ * distribution functions. The KS statistic is defined as the maximum absolute
+ * difference of the empirical CDFs.
+ *
+ * @author Erich Schubert
+ */
+public class KolmogorovSmirnovTest implements GoodnessOfFitTest {
+ /**
+ * Static instance
+ */
+ public static KolmogorovSmirnovTest STATIC = new KolmogorovSmirnovTest();
+
+ /**
+ * Constructor.
+ */
+ public KolmogorovSmirnovTest() {
+ super();
+ }
+
+ @Override
+ public double deviation(double[] fullSample, double[] conditionalSample) {
+ Arrays.sort(fullSample);
+ Arrays.sort(conditionalSample);
+ return calculateTestStatistic(fullSample, conditionalSample);
+ }
+
+ /**
+ * Calculates the maximum distance between the two empirical CDFs of two data
+ * samples. The sample positions and CDFs must be synchronized!
+ *
+ * @param sample1 first data sample positions
+ * @param sample2 second data sample positions
+ * @return the largest distance between both functions
+ */
+ public static double calculateTestStatistic(double[] sample1, double[] sample2) {
+ double maximum = 0.0;
+
+ int index1 = 0, index2 = 0;
+ double cdf1 = 0, cdf2 = 0;
+
+ // Parallel iteration over both curves. We can stop if we reach either end,
+ // As the difference can then only decrease!
+ while(index1 < sample1.length && index2 < sample2.length) {
+ // Next (!) positions
+ final double x1 = sample1[index1], x2 = sample2[index2];
+ // Advance on first curve
+ if(x1 <= x2) {
+ index1++;
+ // Handle multiple points with same x:
+ while(index1 < sample1.length && sample1[index1] == x1) {
+ index1++;
+ }
+ cdf1 = ((double) index1) / sample1.length;
+ }
+ // Advance on second curve
+ if(x1 >= x2) {
+ index2++;
+ // Handle multiple points with same x:
+ while(index2 < sample2.length && sample2[index2] == x2) {
+ index2++;
+ }
+ cdf2 = ((double) index2) / sample2.length;
+ }
+ maximum = Math.max(maximum, Math.abs(cdf1 - cdf2));
+ }
+
+ return maximum;
+ }
+
+ /**
+ * Parameterizer, to use the static instance.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer extends AbstractParameterizer {
+ @Override
+ protected KolmogorovSmirnovTest makeInstance() {
+ return STATIC;
+ }
+ }
+}
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/math/statistics/tests/WelchTTest.java b/src/de/lmu/ifi/dbs/elki/math/statistics/tests/WelchTTest.java new file mode 100644 index 00000000..554a22db --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/math/statistics/tests/WelchTTest.java @@ -0,0 +1,124 @@ +package de.lmu.ifi.dbs.elki.math.statistics.tests;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2012
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import de.lmu.ifi.dbs.elki.math.MeanVariance;
+import de.lmu.ifi.dbs.elki.math.statistics.distribution.StudentsTDistribution;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
+
+/**
+ * Calculates a test statistic according to Welch's t test for two samples
+ * Supplies methods for calculating the degrees of freedom according to the
+ * Welch-Satterthwaite Equation. Also directly calculates a two-sided p-value
+ * for the underlying t-distribution
+ *
+ * @author Jan Brusis
+ * @author Erich Schubert
+ *
+ * @apiviz.uses StudentsTDistribution
+ */
+public class WelchTTest implements GoodnessOfFitTest {
+ /**
+ * Static instance.
+ */
+ public static final WelchTTest STATIC = new WelchTTest();
+
+ /**
+ * Constructor.
+ */
+ public WelchTTest() {
+ super();
+ }
+
+ @Override
+ public double deviation(double[] sample1, double[] sample2) {
+ MeanVariance mv1 = new MeanVariance(), mv2 = new MeanVariance();
+ for(double d : sample1) {
+ mv1.put(d);
+ }
+ for(double d : sample2) {
+ mv2.put(d);
+ }
+
+ final double t = calculateTestStatistic(mv1, mv2);
+ final int v = calculateDOF(mv1, mv2);
+ return 1 - calculatePValue(t, v);
+ }
+
+ /**
+ * Calculate the statistic of Welch's t test using statistical moments of the
+ * provided data samples
+ *
+ * @param mv1 Mean and variance of first sample
+ * @param mv2 Mean and variance of second sample
+ * @return Welch's t statistic
+ */
+ public static double calculateTestStatistic(MeanVariance mv1, MeanVariance mv2) {
+ final double delta = mv1.getMean() - mv2.getMean();
+ final double relvar1 = mv1.getSampleVariance() / mv1.getCount();
+ final double relvar2 = mv2.getSampleVariance() / mv2.getCount();
+ return delta / Math.sqrt(relvar1 + relvar2);
+ }
+
+ /**
+ * Calculates the degree of freedom according to Welch-Satterthwaite
+ *
+ * @param mv1 Mean and variance of first sample
+ * @param mv2 Mean and variance of second sample
+ * @return Estimated degrees of freedom.
+ */
+ public static int calculateDOF(MeanVariance mv1, MeanVariance mv2) {
+ final double relvar1 = mv1.getSampleVariance() / mv1.getCount();
+ final double relvar2 = mv2.getSampleVariance() / mv2.getCount();
+ final double wvariance = relvar1 + relvar2;
+ final double div = relvar1 * relvar1 / (mv1.getCount() - 1) + relvar2 * relvar2 / (mv2.getCount() - 1);
+ return (int) (wvariance * wvariance / div);
+ }
+
+ /**
+ * Calculates the two-sided p-Value of the underlying t-Distribution with v
+ * degrees of freedom
+ *
+ * @param t Integration limit
+ * @param v Degrees of freedom
+ * @return p-Value
+ */
+ public static double calculatePValue(double t, int v) {
+ return 2 * (1 - StudentsTDistribution.cdf(Math.abs(t), v));
+ }
+
+ /**
+ * Parameterizer, to use the static instance.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer extends AbstractParameterizer {
+ @Override
+ protected WelchTTest makeInstance() {
+ return STATIC;
+ }
+ }
+}
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/math/statistics/tests/package-info.java b/src/de/lmu/ifi/dbs/elki/math/statistics/tests/package-info.java new file mode 100644 index 00000000..3e39a519 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/math/statistics/tests/package-info.java @@ -0,0 +1,26 @@ +/** + * <p>Statistical tests.</p> + */ +/* +This file is part of ELKI: +Environment for Developing KDD-Applications Supported by Index-Structures + +Copyright (C) 2012 +Ludwig-Maximilians-Universität München +Lehr- und Forschungseinheit für Datenbanksysteme +ELKI Development Team + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU Affero General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU Affero General Public License for more details. + +You should have received a copy of the GNU Affero General Public License +along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ +package de.lmu.ifi.dbs.elki.math.statistics.tests;
\ No newline at end of file |