package de.lmu.ifi.dbs.elki.math.statistics.distribution.estimator.meta; /* This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures Copyright (C) 2013 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. You should have received a copy of the GNU Affero General Public License along with this program. If not, see . */ import de.lmu.ifi.dbs.elki.math.statistics.distribution.Distribution; import de.lmu.ifi.dbs.elki.math.statistics.distribution.estimator.DistributionEstimator; import de.lmu.ifi.dbs.elki.utilities.datastructures.QuickSelect; import de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.ArrayLikeUtil; import de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.NumberArrayAdapter; import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; /** * Winsorising or Georgization estimator. Similar to trimming, this is expected * to be more robust to outliers. However, instead of removing the extreme * values, they are instead replaced with the cutoff value. This keeps the * quantity of the data the same, and will have a lower impact on variance and * similar measures. * * Reference: *

* C. Hastings, F. Mosteller, J. W. Tukey, C. P. Winsor
* Low moments for small samples: a comparative study of order statistics.
* The Annals of Mathematical Statistics, 18(3) * *

* * @author Erich Schubert * * @apiviz.uses DistributionEstimator * * @param Distribution type */ @Reference(authors = "C. Hastings, F. Mosteller, J. W. Tukey, C. P. Winsor", title = "Low moments for small samples: a comparative study of order statistics", booktitle = "The Annals of Mathematical Statistics, 18(3)", url = "http://dx.doi.org/10.1214/aoms/1177730388") public class WinsorisingEstimator implements DistributionEstimator { /** * Distribution estimator to use. */ private DistributionEstimator inner; /** * Amount of data to winsorize. */ private double winsorize; /** * Constructor. * * @param inner Inner estimator. * @param winsorize Winsorize parameter. */ public WinsorisingEstimator(DistributionEstimator inner, double winsorize) { super(); this.inner = inner; this.winsorize = winsorize; } @Override public D estimate(A data, NumberArrayAdapter adapter) { // We first need the basic parameters: int len = adapter.size(data); final int cut = ((int) (len * winsorize)) >> 1; // X positions of samples double[] x = new double[len]; for(int i = 0; i < len; i++) { final double val = adapter.getDouble(data, i); x[i] = val; } // Partially sort our copy. double min = QuickSelect.quickSelect(x, 0, len, cut); double max = QuickSelect.quickSelect(x, cut, len, len - 1 - cut); // Winsorize by replacing the smallest and largest values. // QuickSelect ensured that these are correctly in place. for(int i = 0, j = len - 1; i < cut; i++, j--) { x[i] = min; x[j] = max; } return inner.estimate(x, ArrayLikeUtil.DOUBLEARRAYADAPTER); } @Override public Class getDistributionClass() { return inner.getDistributionClass(); } @Override public String toString() { return this.getClass().getSimpleName() + "(" + inner.toString() + ", trim=" + winsorize + ")"; } /** * Parameterization class. * * @author Erich Schubert * * @apiviz.exclude * * @param Distribution type */ public static class Parameterizer extends AbstractParameterizer { /** * Option for the class to use on the winsorized sample. */ public static final OptionID INNER_ID = new OptionID("winsorize.inner", "Estimator to use on the winsorized data."); /** * Option for specifying the amount of data to winsorize. */ public static final OptionID WINSORIZE_ID = new OptionID("winsorize.winsorize", "Relative amount of data to winsorize on each end, must be 0 < winsorize < 0.5"); /** * Distribution estimator to use. */ private DistributionEstimator inner; /** * Amount of data to winsorize. */ private double winsorize; @Override protected void makeOptions(Parameterization config) { super.makeOptions(config); ObjectParameter> innerP = new ObjectParameter<>(INNER_ID, DistributionEstimator.class); if(config.grab(innerP)) { inner = innerP.instantiateClass(config); } DoubleParameter trimP = new DoubleParameter(WINSORIZE_ID); trimP.addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE); trimP.addConstraint(CommonConstraints.LESS_THAN_HALF_DOUBLE); if(config.grab(trimP)) { winsorize = trimP.doubleValue(); } } @Override protected WinsorisingEstimator makeInstance() { return new WinsorisingEstimator<>(inner, winsorize); } } }