BHStats.pm 5.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317
  1. package BHStats;
  2. require Exporter;
  3. our @ISA = qw (Exporter);
  4. our @EXPORT = qw (binomial_probability_sum_k_to_n
  5. binomial_probability_sum_k_to_0
  6. binomial_probability
  7. binomial_coefficient
  8. factorial
  9. stDev
  10. stdErr
  11. median
  12. avg
  13. CorrelationCoeff
  14. geometric_mean
  15. min
  16. max
  17. sum
  18. tukey_biweight
  19. );
  20. use strict;
  21. sub binomial_probability_sum_k_to_n {
  22. my ($n,$k,$p) = @_;
  23. my $sum = 0;
  24. for (my $i = $k; $i <= $n; $i++) {
  25. my $binProb = binomial_probability($n,$i,$p);
  26. $sum += $binProb;
  27. }
  28. return ($sum);
  29. }
  30. sub binomial_probability_sum_k_to_0 {
  31. my ($n,$k,$p) = @_;
  32. my $sum = 0;
  33. for (my $i = $k; $i >= 0; $i--) {
  34. my $binProb = binomial_probability($n,$i,$p);
  35. $sum += $binProb;
  36. }
  37. return ($sum);
  38. }
  39. sub binomial_probability {
  40. my ($n_observations, $k_successes, $p_probability) = @_;
  41. my ($n, $k, $p) = ($n_observations, $k_successes, $p_probability);
  42. ### Given B(n,p), find P(X=k)
  43. my $binomial_prob = binomial_coefficient($n,$k) * ($p**$k) * (1-$p)**($n-$k);
  44. return ($binomial_prob);
  45. }
  46. sub binomial_coefficient {
  47. my ($n_things, $k_at_a_time) = @_;
  48. my $number_of_k_arrangements = (factorial($n_things)) / ( factorial($k_at_a_time) * factorial($n_things-$k_at_a_time) );
  49. return ($number_of_k_arrangements);
  50. }
  51. sub factorial {
  52. my $x = shift;
  53. $x = int($x);
  54. my $factorial = 1;
  55. while ($x > 1) {
  56. $factorial *= $x;
  57. $x--;
  58. }
  59. return ($factorial);
  60. }
  61. sub stDev {
  62. # standard deviation calculation
  63. my @nums = @_;
  64. @nums = sort {$a<=>$b} @nums;
  65. my $avg = avg(@nums);
  66. my $count_eles = scalar(@nums);
  67. ## sum up the sqr of diff from avg
  68. my $sum_avg_diffs_sqr = 0;
  69. foreach my $num (@nums) {
  70. my $diff = $num - $avg;
  71. my $sqr = $diff**2;
  72. $sum_avg_diffs_sqr += $sqr;
  73. }
  74. my $stdev = sqrt ($sum_avg_diffs_sqr/($count_eles-1));
  75. return ($stdev);
  76. }
  77. ####
  78. sub stdErr {
  79. my @vals = @_;
  80. my $stdev = &stDev(@vals);
  81. my $num_vals = scalar(@vals);
  82. my $stdErr = $stdev / sqrt($num_vals);
  83. return($stdErr);
  84. }
  85. sub median {
  86. my @nums = @_;
  87. @nums = sort {$a<=>$b} @nums;
  88. my $count = scalar (@nums);
  89. if ($count %2 == 0) {
  90. ## even number:
  91. my $half = $count / 2;
  92. return (avg ($nums[$half-1], $nums[$half]));
  93. }
  94. else {
  95. ## odd number. Return middle value
  96. my $middle_index = int($count/2);
  97. return ($nums[$middle_index]);
  98. }
  99. }
  100. sub avg {
  101. my @nums = @_;
  102. my $total = $#nums + 1;
  103. my $sum = 0;
  104. foreach my $num (@nums) {
  105. $sum += $num;
  106. }
  107. my $avg = $sum/$total;
  108. return ($avg);
  109. }
  110. sub CorrelationCoeff {
  111. my ($x_aref, $y_aref) = @_;
  112. my @x = @$x_aref;
  113. my @y = @$y_aref;
  114. my $total = $#x + 1;
  115. my $avg_x = avg(@x);
  116. my $avg_y = avg(@y);
  117. my $stdev_x = stDev(@x);
  118. my $stdev_y = stDev(@y);
  119. # sum part of equation
  120. my $summation = 0;
  121. for (my $i = 0; $i < $total; $i++) {
  122. my $x_val = $x[$i];
  123. my $y_val = $y[$i];
  124. my $x_part = ($x_val - $avg_x)/$stdev_x;
  125. my $y_part = ($y_val - $avg_y)/$stdev_y;
  126. $summation += ($x_part * $y_part);
  127. }
  128. my $cor = (1/($total-1)) * $summation;
  129. return ($cor);
  130. }
  131. ####
  132. sub geometric_mean {
  133. my @entries = @_;
  134. my $num_entries = scalar (@entries);
  135. unless ($num_entries) {
  136. return (undef);
  137. }
  138. ## All entries must be > 0
  139. my $logsum = 0;
  140. foreach my $entry (@entries) {
  141. unless ($entry > 0) {
  142. return (undef);
  143. }
  144. $logsum += log ($entry);
  145. }
  146. my $geo_mean = exp ( (1/$num_entries) * $logsum);
  147. return ($geo_mean);
  148. }
  149. ####
  150. sub min {
  151. my @vals = @_;
  152. @vals = sort {$a<=>$b} @vals;
  153. my $min_val = shift @vals;
  154. return ($min_val);
  155. }
  156. ####
  157. sub max {
  158. my @vals = @_;
  159. @vals = sort {$a<=>$b} @vals;
  160. my $max_val = pop @vals;
  161. return ($max_val);
  162. }
  163. ####
  164. sub sum {
  165. my @vals = @_;
  166. my $x = 0;
  167. foreach my $val (@vals) {
  168. $x += $val;
  169. }
  170. return ($x);
  171. }
  172. =Rcode
  173. > tukey.biweight
  174. function (x, c = 5, epsilon = 1e-04)
  175. {
  176. m <- median(x)
  177. s <- median(abs(x - m))
  178. u <- (x - m)/(c * s + epsilon)
  179. w <- rep(0, length(x))
  180. i <- abs(u) <= 1
  181. w[i] <- ((1 - u^2)^2)[i]
  182. t.bi <- sum(w * x)/sum(w)
  183. return(t.bi)
  184. }
  185. =cut
  186. ####
  187. sub tukey_biweight {
  188. my (@x) = @_;
  189. my $m = median(@x);
  190. my $s;
  191. {
  192. my @y;
  193. foreach my $val (@x) {
  194. my $t = abs($val - $m);
  195. push (@y, $t);
  196. }
  197. $s = median(@y);
  198. }
  199. my @u;
  200. {
  201. my $epsilon = 1e-4;
  202. my $c = 5;
  203. foreach my $val (@x) {
  204. my $t = $val - $m;
  205. $t /= ($c * $s + $epsilon);
  206. push (@u, $t);
  207. }
  208. }
  209. my @i;
  210. {
  211. foreach my $val (@u) {
  212. my $t = (abs($val) <= 1) ? 1:0;
  213. push (@i, $t);
  214. }
  215. }
  216. my @w;
  217. {
  218. foreach my $val (@u) {
  219. my $i = shift @i;
  220. my $t = ( (1 - $val**2) **2) * $i;
  221. push (@w, $t);
  222. }
  223. }
  224. my $bi;
  225. {
  226. my $sum_w = 0;
  227. for (my $i = 0; $i < @w; $i++) {
  228. my $w = $w[$i];
  229. my $x = $x[$i];
  230. $bi += $w * $x;
  231. $sum_w += $w;
  232. }
  233. $bi /= $sum_w;
  234. }
  235. return($bi);
  236. }
  237. 1; #EOM