;; statistics.lsp -- simple statistics functions ;; to compute statistics, create an object: ;; (setf stats (send statistics-class :new t)) ;; use t to retain the data and nil to not retain the data ;; then call (send stats :point x) for each x in the data set ;; call (send stats :print-stats) to print some statistics ;; see methods below for other methods, e.g. :get-mean ;; ;; to compute histograms, see comments below (setf statistics-class (send class :new '(count sum sum-sqr max min retain data))) (send statistics-class :answer :isnew '(ret) '((send self :init ret))) (send statistics-class :answer :init '(ret) '( (setf count 0 sum 0 sum-sqr 0 data nil max nil min nil retain ret data nil))) (send statistics-class :answer :point '(x) '( (incf count) (setf sum (+ sum x)) (setf sum-sqr (+ sum-sqr (* x x))) (setf max (if max (max max x) x)) (setf min (if min (min min x) x)) (if retain (push x data)))) (send statistics-class :answer :get-count '() '(count)) (send statistics-class :answer :get-data '() '(data)) (send statistics-class :answer :get-min '() '(min)) (send statistics-class :answer :get-max '() '(max)) (send statistics-class :answer :get-mean '() '( (if (> count 0) (/ (float sum) count) nil))) (send statistics-class :answer :get-stddev '() '( (if (> count 1) (sqrt (send self :get-variance)) nil))) (send statistics-class :answer :get-variance '() '( (if (> count 1) (/ (- sum-sqr (/ (* sum sum) (float count))) (1- count)) nil))) (send statistics-class :answer :print-stats '() '( (format t "Number of points: ~A~%Max: ~A~%Min: ~A~%" count max min) (if retain (format t "Median: ~A~%" (send self :get-median))) (format t "Mean: ~A~%Std.Dev.: ~A~%" (send self :get-mean) (send self :get-stddev)) )) (send statistics-class :answer :get-data '() '(data)) (send statistics-class :answer :get-median '() '( (let (i) (cond ((not retain) nil) ;; no data retained to examine ((< count 1) nil) ;; no data to compute from (t (setf data (bigsort data '<)) (cond ((oddp count) (nth (/ count 2) data)) (t (setf i (/ count 2)) (* 0.5 (+ (nth i data) (nth (1- i) data)))))))))) ;; This is the "usual estimator of the population kurtosis" based ;; on Wikipedia. In order for this to work, the statistics object ;; must be initialized to *retain* the data ;; (send statistics-class :answer :get-kurtosis '() '( (let ((x4 0) x2 (n (float count)) ; "n" is just a new name for count (mean (send self :get-mean)) (variance (send self :get-variance))) (dolist (x data) (setf x2 (* (- x mean) (- x mean))) (setf x4 (+ x4 (* x2 x2)))) (display "kurtosis" x4 (* variance variance) n) (if (> n 3) (- (* (/ (* (1+ n) n) (* (1- n) (- n 2) (- n 3))) (/ x4 (* variance variance))) (/ (* 3 (1- n) (1- n)) (* (- n 2) (- n 3)))) nil)))) ;; :FRACTION-IN-RANGE -- proportion of values in a range ;; (send statistics-class :answer :fraction-in-range '(low high) '( (let ((n 0)) (dolist (d data) (if (and (<= low d) (< d high)) (setf n (1+ n)))) (/ (float n) count)))) ;; The histogram-class. Make a histogram from data. ;; ;; To use histogram-class, first make an instance: ;; (setf my-histogram (send histogram-class :new)) ;; Then add points to the histogram. For each point x: ;; (send my-histogram :point x) ;; You can make a default histogram by calling: ;; (send my-histogram :configure-bins) ;; This will create the square root of N bins where N is the ;; number of points. The bins are evenly distributed across ;; the range of the data. ;; Alternatively, you can provide your own thresholds to ;; determine the bins by calling: ;; (send my-histogram :set-thresholds an-array) ;; Each element of an-array represents the lower bound for ;; elements in that bin. E.g. if x is a point, it goes in ;; bin 3 if (aref an-array 3) <= x < (aref an-array 4) ;; Note that nothing goes into bin L-1 where L is the length ;; of an-array. ;; To actually compute the histogram, call ;; (send my-histogram :make-hist) ;; And then you can print or plot it with: ;; (send my-histogram :print-hist) or ;; (send my-histogram :plot-hist) ;; You can change the thresholds with :set-thresholds or ;; :configure-bins without re-inserting all the points. ;; You can start over by calling ;; (send my-histogram :init) ;; but this probably has no advantage over making a new ;; instance. (setf histogram-class (send class :new '(stats counts thresholds))) (send histogram-class :answer :isnew '() '((send self :init))) (send histogram-class :answer :init '() '( (setf counts nil thresholds nil) ; create stats object and tell it to retain points (setf stats (send statistics-class :new t)))) (send histogram-class :answer :point '(x) '( (send stats :point x))) (send histogram-class :answer :configure-bins '() '( (let* ((nbins (round (sqrt (float (send stats :get-count))))) (minthreshold (send stats :get-min)) (step (/ (- (send stats :get-max) (send stats :get-min)) nbins))) (setf thresholds (make-array (1+ nbins))) (dotimes (i (1+ nbins)) (setf (aref thresholds i) (+ minthreshold (* i step))))) thresholds)) (send histogram-class :answer :set-thresholds '(array) '( (setf counts nil) (setf thresholds array))) (send histogram-class :answer :make-hist '(&key (verbose t)) '( (let* ((data (send stats :get-data)) (counter 0) (data-position 0)) (if (null thresholds) (send self :configure-bins)) (cond ((null counts) (setf counts (make-array (1- (length thresholds)))) (dotimes (i (length counts)) (setf (aref counts i) 0)))) (dolist (x data) (cond ((and verbose (> counter 100000)) (format t "make-hist ~A% done\n" (* 100 (/ data-position (float (send stats :get-count))))) (setf counter 0))) ; increment the right bin -- allows different bin sizes but ; could use a binary search for the right bin (dotimes (i (length counts)) (incf counter) (cond ((and (< x (aref thresholds (1+ i))) (>= x (aref thresholds i))) (incf (aref counts i)) (return)))) (incf data-position)) ))) (send histogram-class :answer :print-hist '() '( (if (null counts) (send self :make-hist)) (dotimes (i (length counts)) (format t "~A to ~A: ~A~%" (aref thresholds i) (aref thresholds (1+ i)) (aref counts i))))) (send histogram-class :answer :plot-hist '(&optional (offset 0)) '( (if (null counts) (send self :make-hist)) (s-plot (snd-from-array 0 (/ (- (aref thresholds 1) (aref thresholds 0))) counts)))) (send histogram-class :answer :get-min '() '( (send stats :get-min))) (send histogram-class :answer :get-max '() '( (send stats :get-max))) (send histogram-class :answer :get-count '() '( (send stats :get-count))) (send histogram-class :answer :get-counts '() '( counts)) (send histogram-class :answer :get-thresholds '() '( thresholds)) ;; Pearson correlation - direct (unstable) algorithm ;; ;; I created this to get the "true" answer when I was trying to ;; debug the more complex version below. All three algorithms here ;; now agree (within numerical roundoff), and I believe the ;; pearson-class below is the best implementation. -RBD ;; ;(setf upearson-class (send class :new '(sumxy sumx sumy sumxx sumyy n))) ; ;(send upearson-class :answer :isnew '() '((send self :init))) ;(send upearson-class :answer :init '() '( ; (setf sumxy 0 sumx 0 sumy 0 sumxx 0 sumyy 0 n 0))) ;(send upearson-class :answer :points '(x y) '( ; (setf sumxy (+ sumxy (* x y))) ; (setf sumx (+ sumx x)) ; (setf sumy (+ sumy y)) ; (setf sumxx (+ sumxx (* x x))) ; (setf sumyy (+ sumyy (* y y))) ; (setf n (+ n 1)))) ;(send upearson-class :answer :correlation '() '( ; (/ (- (* n sumxy) (* sumx sumy)) ; (* (sqrt (- (* n sumxx) (* sumx sumx))) ; (sqrt (- (* n sumyy) (* sumy sumy))))))) ;; Pearson correlation ;; (setf pearson-class (send class :new '(sum-sq-x sum-sq-y sum-coproduct mean-x mean-y n))) (send pearson-class :answer :isnew '() '((send self :init))) (send pearson-class :answer :init '() '( (setf n 0) (setf sum-sq-x 0 sum-sq-y 0 sum-coproduct 0))) (send pearson-class :answer :points '(x y) '( (cond ((zerop n) (setf mean-x x mean-y y n 1)) (t (setf n (1+ n)) (let* ((sweep (/ (- n 1.0) n)) (delta-x (- x mean-x)) (delta-y (- y mean-y))) (setf sum-sq-x (+ sum-sq-x (* delta-x delta-x sweep))) (setf sum-sq-y (+ sum-sq-y (* delta-y delta-y sweep))) (setf sum-coproduct (+ sum-coproduct (* delta-x delta-y sweep))) (setf mean-x (+ mean-x (/ delta-x n))) (setf mean-y (+ mean-y (/ delta-y n)))))))) (send pearson-class :answer :correlation '() '( (let* ((pop-sd-x (sqrt (/ sum-sq-x n))) (pop-sd-y (sqrt (/ sum-sq-y n))) (cov-x-y (/ sum-coproduct n))) (/ cov-x-y (* pop-sd-x pop-sd-y))))) ;; This is a very direct implementation of the algorithm below, ;; but it stores the points -- I created this for debugging but ;; I don't see any reason to use it now. -RBD ;(setf npearson-class (send class :new '(pts))) ;(send npearson-class :answer :isnew '() '((send self :init))) ;(send npearson-class :answer :init '() '((setf pts nil))) ;(send npearson-class :answer :points '(x y) '( ; (setf pts (cons (cons x y) pts)))) ;(send npearson-class :answer :correlation '() '( ; (setf pts (reverse pts)) ; (let ((sum-sq-x 0) (sum-sq-y 0) (sum-coproduct 0) (mean-x (caar pts)) ; (mean-y (cdar pts)) i (n (length pts))) ; (dotimes (j (1- n)) ; (let* ((i (+ j 2)) ; (sweep (/ (- i 1.0) i)) ; (delta-x (- (car (nth (1- i) pts)) mean-x)) ; (delta-y (- (cdr (nth (1- i) pts)) mean-y))) ; (setf sum-sq-x (+ sum-sq-x (* delta-x delta-x sweep))) ; (setf sum-sq-y (+ sum-sq-y (* delta-y delta-y sweep))) ; (setf sum-coproduct (+ sum-coproduct (* delta-x delta-y sweep))) ; (setf mean-x (+ mean-x (/ delta-x i))) ; (setf mean-y (+ mean-y (/ delta-y i))))) ; (let ((pop-sd-x (sqrt (/ sum-sq-x n))) ; (pop-sd-y (sqrt (/ sum-sq-y n))) ; (cov-x-y (/ sum-coproduct n))) ; (/ cov-x-y (* pop-sd-x pop-sd-y)))))) ;; the algorithm (from Wikipedia) ;sum_sq_x = 0 ;sum_sq_y = 0 ;sum_coproduct = 0 ;mean_x = x[1] ;mean_y = y[1] ;for i in 2 to N: ; sweep = (i - 1.0) / i ; delta_x = x[i] - mean_x ; delta_y = y[i] - mean_y ; sum_sq_x += delta_x * delta_x * sweep ; sum_sq_y += delta_y * delta_y * sweep ; sum_coproduct += delta_x * delta_y * sweep ; mean_x += delta_x / i ; mean_y += delta_y / i ;pop_sd_x = sqrt( sum_sq_x / N ) ;pop_sd_y = sqrt( sum_sq_y / N ) ;cov_x_y = sum_coproduct / N ;correlation = cov_x_y / (pop_sd_x * pop_sd_y) ;; Welch's t-test to test the null hypothesis that 2 population means are ;; equal when the variances might be unequal ;; ;; returns list: (welchs-t degrees-of-freedom) ;; (defun welchs-t-test (mean1 stddev1 n1 mean2 stddev2 n2) (let* ((var1 (* stddev1 stddev1)) (var2 (* stddev2 stddev2)) (num (- mean1 mean2)) (den (sqrt (+ (/ var1 n1) (/ var2 n2)))) (welchs-t (/ num den)) (dof-a (+ (/ var1 n1) (/ var2 n2))) (dof-num (* dof-a dof-a)) (dof-den (+ (/ (* var1 var1) (* n1 n1 (- n1 1))) (/ (* var2 var2) (* n2 n2 (- n2 1))))) (dof (/ dof-num dof-den))) (list welchs-t dof))) ;; Levene's test to assess the equality of variances in different samples ;; based on Wikipedia article. This implementation is for 2 groups. If the ;; 2 groups can be assumed to be normal (Gaussian), then the F-test should ;; be considered. ;; ;; A variation on Levene's test is the Brown-Forsythe test, which uses ;; medians instead of means. The optional parameter, brown-forsythe can ;; be set to true to get a Browne-Forsythe test instead of Levene's test. ;; ;; The verbose flag defaults to t and prints some useful information ;; ;; The input to levenes-test is a pair of lists of samples. The return ;; value is W (see Wikipedia for details) ;; (defun levenes-test (y1 y2 &optional brown-forsythe (verbose t)) (let* ((n1 (float (length y1))) (n2 (float (length y2))) (n (+ n1 n2)) m1 m2 z1 z2 z.. z1. z2. stat (den 0) w) ;; compute means or medians (cond (brown-forsythe (setf m1 (vector-median y1)) (setf m2 (vector-median y2))) (t (setf m1 (vector-mean y1)) (setf m2 (vector-mean y2)))) ;; compute zij (lists z1 and z2) (dolist (y1j y1) (push (abs (- y1j m1)) z1)) (dolist (y2j y2) (push (abs (- y2j m2)) z2)) ;; compute zi. sums (setf z1. (vector-sum-elements z1)) (setf z2. (vector-sum-elements z2)) ;; compute z.. (setf z.. (/ (+ z1. z2.) n)) ;; convert zi. variables from sums to means (setf z1. (/ z1. n1)) (setf z2. (/ z2. n2)) ;; compute the big denominator term (dolist (z1j z1) (let ((diff (- z1j z1.))) (setf den (+ den (* diff diff))))) (dolist (z2j z2) (let ((diff (- z2j z2.))) (setf den (+ den (* diff diff))))) ;; compute w (setf w (* (- n 2) (/ (+ (* n1 (* (- z1. z..) (- z1. z..))) (* n2 (* (- z2. z..) (- z2. z..)))) den))) ;; print info if verbose (cond (verbose (format t "Summary of ~A test results: Size of group 1: ~A, ~A: ~A Size of group 2: ~A, ~A: ~A W (result): ~A The significance of W is tested against F(alpha, 1, ~A), where alpha is the level of significance (usually 0.05 or 0.01), and ~A is N-2.~%" (if brown-forsythe "Brown-Forsythe" "Levene's") n1 (if brown-forsythe "Median" "Mean") m1 n2 (if brown-forsythe "Median" "Mean") m2 w (- n 2) (- n 2)))) w)) ;; a simple test for levenes-test ;; this program uses distributions.lsp, which must be explicitly loaded ;; (defun levenes-test-test () (let (y1 y2 y3) ;; make some data with sigma 0.1 and 0.2 (dotimes (i 50) (push (gaussian-dist 1.0 0.1) y1)) (dotimes (i 75) (push (gaussian-dist 1.0 0.2) y2)) (dotimes (i 75) (push (gaussian-dist 1.0 0.1) y3)) (format t "\nTHE FOLLOWING HAVE UNEQUAL VARIANCE\n") (levenes-test y1 y2) ;; levene's test (format t "\n") (levenes-test y1 y2 t) ;; brown-forsythe test (format t "\nTHE FOLLOWING HAVE EQUAL VARIANCE\n") (levenes-test y1 y3) ;; levene's test (format t "\n") (levenes-test y1 y3 t) ;; brown-forsythe test (format t "\n") 'done ))