package de.lmu.ifi.dbs.elki.application.experiments;
/*
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
Copyright (C) 2015
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see .
*/
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Locale;
import java.util.Random;
import de.lmu.ifi.dbs.elki.application.AbstractApplication;
import de.lmu.ifi.dbs.elki.math.random.RandomFactory;
import de.lmu.ifi.dbs.elki.math.statistics.intrinsicdimensionality.AggregatedHillEstimator;
import de.lmu.ifi.dbs.elki.math.statistics.intrinsicdimensionality.GEDEstimator;
import de.lmu.ifi.dbs.elki.math.statistics.intrinsicdimensionality.HillEstimator;
import de.lmu.ifi.dbs.elki.math.statistics.intrinsicdimensionality.IntrinsicDimensionalityEstimator;
import de.lmu.ifi.dbs.elki.math.statistics.intrinsicdimensionality.LMomentsEstimator;
import de.lmu.ifi.dbs.elki.math.statistics.intrinsicdimensionality.MOMEstimator;
import de.lmu.ifi.dbs.elki.math.statistics.intrinsicdimensionality.PWM2Estimator;
import de.lmu.ifi.dbs.elki.math.statistics.intrinsicdimensionality.PWMEstimator;
import de.lmu.ifi.dbs.elki.math.statistics.intrinsicdimensionality.RVEstimator;
import de.lmu.ifi.dbs.elki.math.statistics.intrinsicdimensionality.ZipfEstimator;
import de.lmu.ifi.dbs.elki.utilities.FormatUtil;
import de.lmu.ifi.dbs.elki.utilities.datastructures.QuickSelect;
import de.lmu.ifi.dbs.elki.utilities.exceptions.UnableToComplyException;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.EnumParameter;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.RandomParameter;
/**
* Class for testing the estimation quality of intrinsic dimensionality
* estimators.
*
* @author Erich Schubert
*/
public class EvaluateIntrinsicDimensionalityEstimators extends AbstractApplication {
/**
* Benchmark parameters.
*/
int startk = 3, maxk = 10, samples = 1000, dim = 5;
/**
* Aggregation method.
*/
Aggregate agg;
/**
* Output format parameter.
*/
OutputFormat format;
/**
* Random generator.
*/
RandomFactory rnd;
/**
* Constructor.
*
* @param startk Start value of k
* @param maxk Maximum value of k
* @param samples Number of samples
* @param dim Number of dimensions
* @param agg Aggregation method
* @param format Output format
* @param rnd Random seed.
*/
public EvaluateIntrinsicDimensionalityEstimators(int startk, int maxk, int samples, int dim, Aggregate agg, OutputFormat format, RandomFactory rnd) {
this.startk = startk;
this.maxk = maxk;
this.samples = samples;
this.dim = dim;
this.agg = agg;
this.format = format;
this.rnd = rnd;
}
@Override
public void run() throws UnableToComplyException {
ArrayList abbreviat = new ArrayList<>();
ArrayList estimators = new ArrayList<>();
// Hill estimator
abbreviat.add("Hill");
estimators.add(HillEstimator.STATIC);
// Method of Moments
abbreviat.add("MoM");
estimators.add(MOMEstimator.STATIC);
// Regularly varying functions estimator.
abbreviat.add("RV");
estimators.add(RVEstimator.STATIC);
// Aggregated hill estimator
abbreviat.add("AggHi");
estimators.add(AggregatedHillEstimator.STATIC);
// Zipf estimator (qq-estimator)
abbreviat.add("Zipf");
estimators.add(ZipfEstimator.STATIC);
// Generalized expansion dimension
abbreviat.add("GED");
estimators.add(GEDEstimator.STATIC);
// L-Moments based
abbreviat.add("LMM");
estimators.add(LMomentsEstimator.STATIC);
// Probability weighted moments, using first moment only.
abbreviat.add("PWM");
estimators.add(PWMEstimator.STATIC);
// Probability weighted moments, using second moment.
abbreviat.add("PWM2");
estimators.add(PWM2Estimator.STATIC);
PrintStream out = System.out; // TODO: add output file parameter?
final int digits = (int) Math.ceil(Math.log10(maxk + 1));
switch(format){
case TABULAR:
out.append(String.format("%" + digits + "s", "k"));
for(int i = 0; i < estimators.size(); i++) {
for(String postfix : agg.description()) {
out.format(Locale.ROOT, " %10s", abbreviat.get(i) + "-" + postfix);
}
}
out.append(FormatUtil.NEWLINE);
break;
case TSV:
out.append("k");
for(int i = 0; i < estimators.size(); i++) {
for(String postfix : agg.description()) {
out.append('\t').append(abbreviat.get(i)).append('-').append(postfix);
}
}
out.append(FormatUtil.NEWLINE);
break;
}
double[][] v = new double[estimators.size()][samples];
for(int l = startk; l <= maxk; l++) {
for(int p = 0; p < samples; p++) {
// Prefer independent samples.
double[] dists = makeSample(l);
for(int i = 0; i < estimators.size(); i++) {
IntrinsicDimensionalityEstimator est = estimators.get(i);
v[i][p] = est.estimate(dists, l);
}
}
switch(format){
case TABULAR:
out.append(String.format("%0" + digits + "d", l));
for(int i = 0; i < estimators.size(); i++) {
for(double val : agg.aggregate(v[i])) {
out.format(Locale.ROOT, " %10f", val);
}
}
out.append(FormatUtil.NEWLINE);
break;
case TSV:
out.append(FormatUtil.NF.format(l));
for(int i = 0; i < estimators.size(); i++) {
for(double val : agg.aggregate(v[i])) {
out.append('\t');
out.append(FormatUtil.NF.format(val));
}
}
out.append(FormatUtil.NEWLINE);
break;
}
}
}
/**
* Generate a data sample.
*
* @param maxk Number of entries.
* @return Data sample
*/
protected double[] makeSample(int maxk) {
final Random rnd = this.rnd.getSingleThreadedRandom();
double[] dists = new double[maxk + 1];
final double e = 1. / dim;
for(int i = 0; i <= maxk; i++) {
dists[i] = Math.pow(rnd.nextDouble(), e);
}
Arrays.sort(dists);
return dists;
}
/**
* Output format
*
* @author Erich Schubert
*
* @apiviz.exclude
*/
public static enum OutputFormat {
TABULAR, TSV
}
/**
* Aggregation methods.
*
* @author Erich Schubert
*
* @apiviz.exclude
*/
public static enum Aggregate {
/**
* Aggregate using the mean only.
*/
MEAN {
@Override
double[] aggregate(double[] data) {
double avg = 0.;
for(double val : data) {
avg += val;
}
return new double[] { avg / data.length };
}
@Override
String[] description() {
return new String[] { "Mean" };
}
},
/**
* Aggregate as mean and standard deviation.
*/
MEAN_STDDEV {
@Override
double[] aggregate(double[] data) {
double avg = 0.;
for(double val : data) {
avg += val;
}
avg /= data.length;
double sqsum = 0.;
for(double val : data) {
double v = val - avg;
sqsum += v * v;
}
sqsum /= data.length;
return new double[] { avg, Math.sqrt(sqsum) };
}
@Override
String[] description() {
return new String[] { "Mean", "Stddev" };
}
},
/**
* Aggregate as mean and standard deviation.
*/
MEAN_STDDEV_MIN_MAX {
@Override
double[] aggregate(double[] data) {
double avg = 0.;
double min = Double.POSITIVE_INFINITY, max = Double.NEGATIVE_INFINITY;
for(double val : data) {
avg += val;
min = (val < min) ? val : min;
max = (val > max) ? val : max;
}
avg /= data.length;
double sqsum = 0.;
for(double val : data) {
double v = val - avg;
sqsum += v * v;
}
sqsum /= data.length;
return new double[] { avg, Math.sqrt(sqsum), min, max };
}
@Override
String[] description() {
return new String[] { "Mean", "Stddev", "Min", "Max" };
}
},
/**
* Harmonic mean.
*/
HMEAN {
@Override
double[] aggregate(double[] data) {
double avg = 0.;
for(double val : data) {
avg += 1. / val;
}
return new double[] { data.length / avg };
}
@Override
String[] description() {
return new String[] { "HMean" };
}
},
/**
* Aggregate using median.
*/
MEDIAN {
@Override
double[] aggregate(double[] data) {
double med = QuickSelect.median(data);
return new double[] { med };
}
@Override
String[] description() {
return new String[] { "Median" };
}
},
/**
* Aggregate using median and MAD.
*/
MED_MAD {
@Override
double[] aggregate(double[] data) {
double med = QuickSelect.median(data);
double[] devs = new double[data.length];
for(int i = 0; i < data.length; i++) {
devs[i] = Math.abs(data[i] - med);
}
double mad = QuickSelect.median(devs);
return new double[] { med, mad };
}
@Override
String[] description() {
return new String[] { "Med", "Mad" };
}
},
/**
* Aggregate using median and MAD.
*/
MED_MAD_MIN_MAX {
@Override
double[] aggregate(double[] data) {
double med = QuickSelect.median(data);
double[] devs = new double[data.length];
double min = med, max = med;
for(int i = 0; i < data.length; i++) {
double v = data[i];
min = (v < min) ? v : min;
max = (v > max) ? v : max;
devs[i] = Math.abs(v - med);
}
double mad = QuickSelect.median(devs);
return new double[] { med, mad, min, max };
}
@Override
String[] description() {
return new String[] { "Med", "Mad", "Min", "Max" };
}
},
/**
* Selected quantiles.
*/
QUANTILES {
@Override
double[] aggregate(double[] data) {
final double[] quants = { 0, .1, .25, .5, .75, .9, 1. };
final int l = data.length;
Arrays.sort(data); // QuickSelect would do, actually
double[] ret = new double[quants.length];
for(int i = 0; i < quants.length; i++) {
final double dleft = (l - 1) * quants[i];
final int ileft = (int) Math.floor(dleft);
final double err = dleft - ileft;
if(err < Double.MIN_NORMAL) {
ret[i] = data[ileft];
}
else {
ret[i] = data[ileft] + (data[ileft + 1] - data[ileft]) * err;
}
}
return ret;
}
@Override
String[] description() {
return new String[] { "Min", "Q10", "Q25", "Med", "Q75", "Q90", "Max" };
}
},
// Last alternative.
;
/**
* Aggregate values.
*
* @param data Data to aggregate.
* @return Aggregated values.
*/
abstract double[] aggregate(double[] data);
/**
* Descriptions of the aggregate values.
*
* @return Descriptions
*/
abstract String[] description();
}
/**
* Main method
*/
public static void main(String[] args) {
runCLIApplication(EvaluateIntrinsicDimensionalityEstimators.class, args);
}
/**
* Parameterization class.
*
* @author Erich Schubert
*
* @apiviz.exclude
*/
public static class Parameterizer extends AbstractApplication.Parameterizer {
/**
* Initial neighborhood size.
*/
public static final OptionID STARTK_ID = new OptionID("mink", "Minimum value of k.");
/**
* Final neighborhood size.
*/
public static final OptionID MAXK_ID = new OptionID("maxk", "Maximum value of k.");
/**
* Samples size.
*/
public static final OptionID SAMPLE_ID = new OptionID("sample", "Sample size for averaging.");
/**
* Dimensionality.
*/
public static final OptionID DIM_ID = new OptionID("dim", "Dimensionality.");
/**
* Random seed.
*/
public static final OptionID SEED_ID = new OptionID("seed", "Random seed.");
/**
* Aggregation method.
*/
public static final OptionID AGGREGATE_ID = new OptionID("aggregation", "Aggregation method.");
/**
* Output format.
*/
public static final OptionID FORMAT_ID = new OptionID("output-format", "Output format (ascii, or tab separated).");
/**
* Benchmark parameters.
*/
int startk = 3, maxk = 10, samples = 1000, dim = 5;
/**
* Aggregation method.
*/
Aggregate agg;
/**
* Output format parameter.
*/
OutputFormat format;
/**
* Random generator.
*/
RandomFactory rnd;
@Override
protected void makeOptions(Parameterization config) {
super.makeOptions(config);
IntParameter startP = new IntParameter(STARTK_ID, 3);
if(config.grab(startP)) {
startk = startP.intValue();
}
IntParameter maxkP = new IntParameter(MAXK_ID, 20);
if(config.grab(maxkP)) {
maxk = maxkP.intValue();
}
IntParameter samplesP = new IntParameter(SAMPLE_ID, 1000);
if(config.grab(samplesP)) {
samples = samplesP.intValue();
}
RandomParameter seedP = new RandomParameter(SEED_ID);
if(config.grab(seedP)) {
rnd = seedP.getValue();
}
IntParameter dimP = new IntParameter(DIM_ID);
if(config.grab(dimP)) {
dim = dimP.intValue();
}
EnumParameter aggP = new EnumParameter<>(AGGREGATE_ID, Aggregate.class, Aggregate.MED_MAD);
if(config.grab(aggP)) {
agg = aggP.getValue();
}
EnumParameter formatP = new EnumParameter<>(FORMAT_ID, OutputFormat.class, OutputFormat.TABULAR);
if(config.grab(formatP)) {
format = formatP.getValue();
}
}
@Override
protected EvaluateIntrinsicDimensionalityEstimators makeInstance() {
return new EvaluateIntrinsicDimensionalityEstimators(startk, maxk, samples, dim, agg, format, rnd);
}
}
}