package de.lmu.ifi.dbs.elki.math.statistics; /* This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures Copyright (C) 2011 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. You should have received a copy of the GNU Affero General Public License along with this program. If not, see . */ /** * QuickSelect computes ("selects") the element at a given rank and can be used * to compute Medians and arbitrary quantiles by computing the appropriate rank. * * This algorithm is essentially an incomplete QuickSort that only descends into * that part of the data that we are interested in, and also attributed to * Charles Antony Richard Hoare * * @author Erich Schubert */ public class QuickSelect { /** * For small arrays, use a simpler method: */ private static final int SMALL = 10; /** * QuickSelect is essentially quicksort, except that we only "sort" that half * of the array that we are interested in. * * Note: the array is modified by this. * * @param data Data to process * @param rank Rank position that we are interested in (integer!) * @return Value at the given rank */ public static double quickSelect(double[] data, int rank) { quickSelect(data, 0, data.length - 1, rank); return data[rank]; } /** * Compute the median of an array efficiently using the QuickSelect method. * * Note: the array is modified by this. * * @param data Data to process * @return Median value */ public static double median(double[] data) { return median(data, 0, data.length - 1); } /** * Compute the median of an array efficiently using the QuickSelect method. * * Note: the array is modified by this. * * @param data Data to process * @param begin Begin of valid values * @param end End of valid values (inclusive!) * @return Median value */ public static double median(double[] data, int begin, int end) { final int length = (end + 1) - begin; assert (length > 0); // Integer division is "floor" since we are non-negative. final int left = begin + (length - 1) / 2; quickSelect(data, begin, end, left); if(length % 2 == 1) { return data[left]; } else { quickSelect(data, begin, end, left + 1); return data[left] + (data[left + 1] - data[left]) / 2; } } /** * Compute the median of an array efficiently using the QuickSelect method. * * Note: the array is modified by this. * * @param data Data to process * @param quant Quantile to compute * @return Value at quantile */ public static double quantile(double[] data, double quant) { return quantile(data, 0, data.length - 1, quant); } /** * Compute the median of an array efficiently using the QuickSelect method. * * Note: the array is modified by this. * * @param data Data to process * @param begin Begin of valid values * @param end End of valid values (inclusive!) * @param quant Quantile to compute * @return Value at quantile */ public static double quantile(double[] data, int begin, int end, double quant) { final int length = (end + 1) - begin; assert (length > 0) : "Quantile on empty set?"; // Integer division is "floor" since we are non-negative. final double dleft = begin + (length - 1) * quant; final int ileft = (int) Math.floor(dleft); final double err = dleft - ileft; quickSelect(data, begin, end, ileft); if(err <= Double.MIN_NORMAL) { return data[ileft]; } else { quickSelect(data, begin, end, ileft + 1); // Mix: double mix = data[ileft] + (data[ileft + 1] - data[ileft]) * err; return mix; } } /** * QuickSelect is essentially quicksort, except that we only "sort" that half * of the array that we are interested in. * * @param data Data to process * @param start Interval start * @param end Interval end (inclusive) * @param rank rank position we are interested in (starting at 0) */ public static void quickSelect(double[] data, int start, int end, int rank) { // Optimization for small arrays // This also ensures a minimum size below if(start + SMALL > end) { insertionSort(data, start, end); return; } // Pick pivot from three candidates: start, middle, end // Since we compare them, we can also just "bubble sort" them. final int middle = (start + end) / 2; if(data[start] > data[middle]) { swap(data, start, middle); } if(data[start] > data[end]) { swap(data, start, end); } if(data[middle] > data[end]) { swap(data, middle, end); } // TODO: use more candidates for larger arrays? final double pivot = data[middle]; // Move middle element out of the way, just before end // (Since we already know that "end" is bigger) swap(data, middle, end - 1); // Begin partitioning int i = start + 1, j = end - 2; // This is classic quicksort stuff while(true) { while(data[i] <= pivot && i <= j) { i++; } while(data[j] >= pivot && j >= i) { j--; } if(i >= j) { break; } swap(data, i, j); } // Move pivot (former middle element) back into the appropriate place swap(data, i, end - 1); // In contrast to quicksort, we only need to recurse into the half we are // interested in. if(rank < i) { quickSelect(data, start, i - 1, rank); } else if(rank > i) { quickSelect(data, i + 1, end, rank); } } /** * Sort a small array using repetitive insertion sort. * * @param data Data to sort * @param start Interval start * @param end Interval end */ private static void insertionSort(double[] data, int start, int end) { for(int i = start + 1; i <= end; i++) { for(int j = i; j > start && data[j - 1] > data[j]; j--) { swap(data, j, j - 1); } } } /** * The usual swap method. * * @param data Array * @param a First index * @param b Second index */ private static final void swap(double[] data, int a, int b) { double tmp = data[a]; data[a] = data[b]; data[b] = tmp; } }