package de.lmu.ifi.dbs.elki.math.statistics.tests;
/*
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
Copyright (C) 2015
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see .
*/
import java.util.Arrays;
import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
/**
* Perform a two-sample Anderson-Darling rank test, and standardize the
* statistic according to Scholz and Stephens. Ties are handled as discussed in
* Equation 7 of Scholz and Stephens.
*
* To access the non-standardized A2 scores, use the function
* {@link #unstandardized}.
*
* Compared to the Cramer-van Mises test, the Anderson-Darling test puts more
* weight on the tail of the distribution. This variant only uses the ranks.
*
*
* References:
*
* D. A. Darling
* The Kolmogorov-Smirnov, Cramer-von Mises tests.
* Annals of Mathematical Statistics 28(4)
*
*
* A. N. Pettitt
* A two-sample Anderson-Darling rank statistic
* Biometrika 63 (1)
*
*
* F. W. Scholz, and M. A. Stephens
* K-sample Anderson–Darling tests
* Journal of the American Statistical Association, 82(399)
*
*
* @author Erich Schubert
*/
@Reference(authors = "F. W. Scholz, and M. A. Stephens", //
title = "K-sample Anderson–Darling tests", //
booktitle = "Journal of the American Statistical Association, 82(399)", //
url = "http://dx.doi.org/10.1080/01621459.1987.10478517")
public class StandardizedTwoSampleAndersonDarlingTest implements GoodnessOfFitTest {
/**
* Static instance.
*/
public static final StandardizedTwoSampleAndersonDarlingTest STATIC = new StandardizedTwoSampleAndersonDarlingTest();
/** Additional reference -- Darling's note on this equation */
@Reference(authors = "D. A. Darling", //
title = "The Kolmogorov-Smirnov, Cramer-von Mises tests", //
booktitle = "Annals of mathematical statistics 28(4)", //
url = "http://dx.doi.org/10.1214/aoms/1177706788")
public static final Void ADDITIONAL_REFERENCE_1 = null;
/** More detailed discussion by Pettitt */
@Reference(authors = "A. N. Pettitt", //
title = "A two-sample Anderson-Darling rank statistic",//
booktitle = "Biometrika 63 (1)", //
url = "http://dx.doi.org/10.1093/biomet/63.1.161")
public static final Void ADDITIONAL_REFERENCE_2 = null;
@Override
public double deviation(double[] sample1, double[] sample2) {
final double A2 = unstandardized(sample1, sample2);
final int N = sample1.length + sample2.length;
final double H = 1. / sample1.length + 1. / sample2.length;
// Compute g and h in a single pass:
double g = 0., h = 1. / (N - 1);
for(int i = N - 2; i > 0; i--) {
g += h / (N - i);
h += 1. / i;
}
final double a = 4 * g - 6 + (10 - 6 * g) * H;
final double b = 12 * g - 22 + 8 * h + (2 * g - 14 * h - 4) * H;
final double c = 36 * h + 4 + (2 * h - 6) * H;
final double d = 24;
final double N2 = N * N, N3 = N2 * N;
final double sigmasq = (a * N3 + b * N2 + c * N + d) / ((N - 1.) * (N - 2.) * (N - 3.));
return sigmasq > 0 ? (A2 - (2 - 1)) / Math.sqrt(sigmasq) : 0.;
}
/**
* K-samples version of the Anderson-Darling test.
*
* @param samples Samples
* @return A2 score
*/
public double deviation(double[][] samples) {
final int N = totalLength(samples);
final double A2 = unstandardized(samples, N);
final int k = samples.length;
double H = 0.;
for(double[] sample : samples) {
H += 1. / sample.length;
}
// Compute g and h in a single pass:
double g = 0., h = 1. / (N - 1);
for(int i = N - 2; i > 0; i--) {
g += h / (N - i);
h += 1. / i;
}
final int k2 = k * k;
final double hk = h * k;
final double a = (4 * g - 6) * (k - 1) + (10 - 6 * g) * H;
final double b = (2 * g - 4) * k2 + 8 * hk + (2 * g - 14 * h - 4) * H - 8 * h + 4 * g - 6;
final double c = (6 * h + 2 * g - 2) * k2 + (4 * h - 4 * g + 6) * k + (2 * h - 6) * H + 4 * h;
final double d = (2 * h + 6) * k2 - 4 * hk;
final double N2 = N * N, N3 = N2 * N;
final double sigmasq = (a * N3 + b * N2 + c * N + d) / ((N - 1.) * (N - 2.) * (N - 3.));
return sigmasq > 0 ? (A2 - (k - 1)) / Math.sqrt(sigmasq) : 0.;
}
/**
* Compute the non-standardized A2 test statistic for the k-samples test.
*
* @param samples Samples
* @return Test statistic
*/
public double unstandardized(double[][] samples) {
final int N = totalLength(samples);
return unstandardized(samples, N);
}
/**
* Compute the non-standardized A2 test statistic for the k-samples test.
*
* This is based on Scholz and Stephens, Equation 7.
*
* @param samples Samples
* @param N total length
* @return Test statistic
*/
private double unstandardized(double[][] samples, int N) {
final int k = samples.length;
// Sort the sample data (API allows mutation!), and join:
final double[] combined = new double[N];
{
int p = 0;
for(double[] samp : samples) {
Arrays.sort(samp);
System.arraycopy(samp, 0, combined, p, samp.length);
p += samp.length;
}
assert (p == N);
}
// Sort joined data
Arrays.sort(combined);
int[] m = new int[k];
double[] Ak = new double[k];
for(int j = 0; j < N;) {
// Find multiplicity in combined sample:
final double x = combined[j++];
int lj = 1; // Count multiplicity:
while(j < N && combined[j] == x) {
++j;
++lj;
}
// Process each sample:
for(int i = 0; i < k; i++) {
final double[] sam = samples[i];
// Multiplicity in sample:
int mi = m[i];
assert (mi >= sam.length || sam[mi] >= x);
int fi = 0; // Multiplicity
while(mi < sam.length && sam[mi] == x) {
++mi;
++fi;
}
if(fi > 0) {
assert (m[i] + fi == mi);
m[i] = mi; // Store
}
double bi = j - .5 * lj;
double v = N * (mi - .5 * fi) - sam.length * bi;
Ak[i] += lj * v * v / (bi * (N - bi) - .25 * N * lj);
}
}
double A2 = 0.;
for(int j = 0; j < k; j++) {
A2 += Ak[j] / samples[j].length;
}
A2 *= (N - 1.) / (N * N);
return A2;
}
/**
* Compute the non-standardized A2 test statistic for the k-samples test.
*
* This is based on Scholz and Stephens, Equation 7.
*
* @param sample1 First sample
* @param sample2 Second sample
* @return Test statistic
*/
public double unstandardized(double[] sample1, double[] sample2) {
final int n1 = sample1.length, n2 = sample2.length, N = n1 + n2;
// Sort the sample data (API allows mutation!), and join:
final double[] combined = new double[N];
Arrays.sort(sample1);
System.arraycopy(sample1, 0, combined, 0, n1);
Arrays.sort(sample2);
System.arraycopy(sample2, 0, combined, n1, n2);
// Sort joined data
Arrays.sort(combined);
int m1 = 0, m2 = 0; // Current position
double Ak1 = 0., Ak2 = 0.; // Aggregates
for(int j = 0; j < N;) {
// Find multiplicity in combined sample:
final double x = combined[j++];
int lj = 1; // Count multiplicity:
while(j < N && combined[j] == x) {
++j;
++lj;
}
final double bi = j - .5 * lj;
{// First sample:
assert (m1 >= n1 || sample1[m1] >= x);
int f1 = 0; // Multiplicity
while(m1 < n1 && sample1[m1] == x) {
++m1;
++f1;
}
double v = N * (m1 - .5 * f1) - n1 * bi;
Ak1 += lj * v * v / (bi * (N - bi) - .25 * N * lj);
}
{ // Second sample
assert (m2 >= n2 || sample2[m2] >= x);
int f2 = 0; // Multiplicity
while(m2 < n2 && sample2[m2] == x) {
++m2;
++f2;
}
double v = N * (m2 - .5 * f2) - n2 * bi;
Ak2 += lj * v * v / (bi * (N - bi) - .25 * N * lj);
}
}
double A2 = Ak1 / n1 + Ak2 / n2;
A2 *= (N - 1.) / (N * N);
return A2;
}
/**
* Total length of a set of Samples.
*
* @param samples Samples
* @return Sum of the lengths.
*/
private int totalLength(double[][] samples) {
int N = 0;
for(double[] samp : samples) {
N += samp.length;
}
return N;
}
}