123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317 |
- package BHStats;
- require Exporter;
- our @ISA = qw (Exporter);
- our @EXPORT = qw (binomial_probability_sum_k_to_n
- binomial_probability_sum_k_to_0
- binomial_probability
- binomial_coefficient
- factorial
- stDev
- stdErr
- median
- avg
- CorrelationCoeff
- geometric_mean
- min
- max
- sum
- tukey_biweight
- );
- use strict;
- sub binomial_probability_sum_k_to_n {
- my ($n,$k,$p) = @_;
- my $sum = 0;
- for (my $i = $k; $i <= $n; $i++) {
- my $binProb = binomial_probability($n,$i,$p);
- $sum += $binProb;
- }
- return ($sum);
- }
- sub binomial_probability_sum_k_to_0 {
- my ($n,$k,$p) = @_;
- my $sum = 0;
- for (my $i = $k; $i >= 0; $i--) {
- my $binProb = binomial_probability($n,$i,$p);
- $sum += $binProb;
- }
- return ($sum);
- }
- sub binomial_probability {
- my ($n_observations, $k_successes, $p_probability) = @_;
-
- my ($n, $k, $p) = ($n_observations, $k_successes, $p_probability);
-
- ### Given B(n,p), find P(X=k)
-
- my $binomial_prob = binomial_coefficient($n,$k) * ($p**$k) * (1-$p)**($n-$k);
-
- return ($binomial_prob);
-
- }
- sub binomial_coefficient {
- my ($n_things, $k_at_a_time) = @_;
-
- my $number_of_k_arrangements = (factorial($n_things)) / ( factorial($k_at_a_time) * factorial($n_things-$k_at_a_time) );
- return ($number_of_k_arrangements);
- }
- sub factorial {
- my $x = shift;
- $x = int($x);
- my $factorial = 1;
- while ($x > 1) {
- $factorial *= $x;
- $x--;
- }
- return ($factorial);
- }
- sub stDev {
- # standard deviation calculation
- my @nums = @_;
- @nums = sort {$a<=>$b} @nums;
-
-
- my $avg = avg(@nums);
- my $count_eles = scalar(@nums);
-
- ## sum up the sqr of diff from avg
- my $sum_avg_diffs_sqr = 0;
- foreach my $num (@nums) {
- my $diff = $num - $avg;
- my $sqr = $diff**2;
- $sum_avg_diffs_sqr += $sqr;
- }
- my $stdev = sqrt ($sum_avg_diffs_sqr/($count_eles-1));
- return ($stdev);
- }
- ####
- sub stdErr {
- my @vals = @_;
- my $stdev = &stDev(@vals);
-
- my $num_vals = scalar(@vals);
- my $stdErr = $stdev / sqrt($num_vals);
- return($stdErr);
- }
- sub median {
- my @nums = @_;
-
- @nums = sort {$a<=>$b} @nums;
-
- my $count = scalar (@nums);
- if ($count %2 == 0) {
- ## even number:
- my $half = $count / 2;
- return (avg ($nums[$half-1], $nums[$half]));
- }
- else {
- ## odd number. Return middle value
- my $middle_index = int($count/2);
- return ($nums[$middle_index]);
- }
- }
- sub avg {
- my @nums = @_;
- my $total = $#nums + 1;
- my $sum = 0;
- foreach my $num (@nums) {
- $sum += $num;
- }
- my $avg = $sum/$total;
- return ($avg);
- }
- sub CorrelationCoeff {
- my ($x_aref, $y_aref) = @_;
- my @x = @$x_aref;
- my @y = @$y_aref;
-
- my $total = $#x + 1;
- my $avg_x = avg(@x);
- my $avg_y = avg(@y);
-
- my $stdev_x = stDev(@x);
- my $stdev_y = stDev(@y);
-
- # sum part of equation
- my $summation = 0;
- for (my $i = 0; $i < $total; $i++) {
- my $x_val = $x[$i];
- my $y_val = $y[$i];
-
- my $x_part = ($x_val - $avg_x)/$stdev_x;
- my $y_part = ($y_val - $avg_y)/$stdev_y;
-
- $summation += ($x_part * $y_part);
- }
-
- my $cor = (1/($total-1)) * $summation;
-
- return ($cor);
- }
- ####
- sub geometric_mean {
- my @entries = @_;
-
- my $num_entries = scalar (@entries);
- unless ($num_entries) {
- return (undef);
- }
-
- ## All entries must be > 0
- my $logsum = 0;
- foreach my $entry (@entries) {
- unless ($entry > 0) {
- return (undef);
- }
- $logsum += log ($entry);
- }
-
- my $geo_mean = exp ( (1/$num_entries) * $logsum);
-
- return ($geo_mean);
- }
- ####
- sub min {
- my @vals = @_;
-
- @vals = sort {$a<=>$b} @vals;
-
- my $min_val = shift @vals;
-
- return ($min_val);
- }
- ####
- sub max {
- my @vals = @_;
-
- @vals = sort {$a<=>$b} @vals;
-
- my $max_val = pop @vals;
-
- return ($max_val);
- }
- ####
- sub sum {
- my @vals = @_;
- my $x = 0;
- foreach my $val (@vals) {
- $x += $val;
- }
-
- return ($x);
- }
- =Rcode
- > tukey.biweight
- function (x, c = 5, epsilon = 1e-04)
- {
- m <- median(x)
- s <- median(abs(x - m))
- u <- (x - m)/(c * s + epsilon)
- w <- rep(0, length(x))
- i <- abs(u) <= 1
- w[i] <- ((1 - u^2)^2)[i]
- t.bi <- sum(w * x)/sum(w)
- return(t.bi)
- }
- =cut
- ####
- sub tukey_biweight {
- my (@x) = @_;
- my $m = median(@x);
-
- my $s;
- {
- my @y;
- foreach my $val (@x) {
- my $t = abs($val - $m);
- push (@y, $t);
- }
- $s = median(@y);
- }
-
- my @u;
- {
- my $epsilon = 1e-4;
- my $c = 5;
-
- foreach my $val (@x) {
- my $t = $val - $m;
- $t /= ($c * $s + $epsilon);
- push (@u, $t);
- }
- }
-
- my @i;
- {
- foreach my $val (@u) {
- my $t = (abs($val) <= 1) ? 1:0;
- push (@i, $t);
- }
- }
- my @w;
- {
- foreach my $val (@u) {
- my $i = shift @i;
- my $t = ( (1 - $val**2) **2) * $i;
- push (@w, $t);
- }
- }
-
- my $bi;
- {
- my $sum_w = 0;
- for (my $i = 0; $i < @w; $i++) {
- my $w = $w[$i];
- my $x = $x[$i];
- $bi += $w * $x;
- $sum_w += $w;
- }
- $bi /= $sum_w;
- }
- return($bi);
- }
- 1; #EOM
|