## LikelihoodRatioTest_fun.R

## The following code evaluates the performance of different likelihood ratio-based tests
## Yuanlong LIU
## 01-03-2018

############## model 1: keep the same structure ##############
## In this case, a tree structure should be provided
## Constraint: the top three levels have the same contact intensity parameter

p_likelihood_ratio <- function( A, head, mid, tail, num )
{
    ## split the region [head, tail] at mid into two candidate sub-domains a and b
    A_whole = A[head:tail, head:tail]
    A_a = A[head:mid, head:mid]
    A_b = A[(mid+1):tail, (mid+1):tail]
    A_ab = A[head:mid, (mid+1):tail]

    tri_Awhole = A_whole[upper.tri(A_whole, diag=FALSE)]
    tri_Aa = A_a[upper.tri(A_a, diag=FALSE)]
    tri_Ab = A_b[upper.tri(A_b, diag=FALSE)]

    # gamma_fit(tri_Awhole, num)
    inner_a = get_prob( tri_Aa )
    inner_b = get_prob( tri_Ab )
    inter_ab = get_prob( A_ab )

    ## negative binomial
    # inner_a = get_prob_nb( tri_Aa )
    # inner_b = get_prob_nb( tri_Ab )
    # inter_ab = get_prob_nb( A_ab )

    ## log-likelihood of H1: each block has its own intensity parameter
    LH1 = inner_a + inner_b + inter_ab
    ## log-likelihood of H0: one shared parameter for the whole region
    LH0 = get_prob( tri_Awhole )
    Lambda = -2*( LH0 - LH1 )
    # cat(Lambda, '\n')

    ## one parameter under H0, three under H1 (one per block)
    df_h0 = 1*1
    df_h1 = 3*1
    df = df_h1 - df_h0
    p = pchisq(Lambda, df=df, lower.tail = FALSE, log.p = FALSE)
    info = list( Lambda=Lambda, p=p )
    return(info)
}
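
## Usage sketch (hypothetical call, not part of the pipeline): `A` would be a
## symmetric contact matrix and `get_prob` a helper (defined elsewhere) that
## returns a maximised log-likelihood. Here we ask whether splitting rows/columns
## 1:100 at 50 into two domains fits better than one homogeneous block; a small
## p-value favours the split.
# res = p_likelihood_ratio( A, head=1, mid=50, tail=100, num=NULL )
# res$Lambda  ## likelihood-ratio statistic
# res$p       ## chi-square p-value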

p_likelihood_ratio_nb <- function( A, head, mid, tail )
{
    A_whole = A[head:tail, head:tail]
    A_a = A[head:mid, head:mid]
    A_b = A[(mid+1):tail, (mid+1):tail]
    A_ab = A[head:mid, (mid+1):tail]

    tri_Awhole = A_whole[upper.tri(A_whole, diag=TRUE)]
    tri_Aa = A_a[upper.tri(A_a, diag=TRUE)]
    tri_Ab = A_b[upper.tri(A_b, diag=TRUE)]

    ## negative binomial
    inner_a = get_prob_nb( tri_Aa )
    inner_b = get_prob_nb( tri_Ab )
    inter_ab = get_prob_nb( A_ab )

    ## log-likelihood of H1
    LH1 = inner_a + inner_b + inter_ab
    LH0 = get_prob_nb( tri_Awhole )
    Lambda = -2*( LH0 - LH1 )
    # cat(Lambda, '\n')

    n_parameters = 2 ## the negative binomial has two parameters per block
    df_h0 = 1*n_parameters
    df_h1 = 3*n_parameters
    df = df_h1 - df_h0
    p = pchisq(Lambda, df=df, lower.tail = FALSE, log.p = FALSE)
    info = list( Lambda=Lambda, p=p )
    return(info)
}
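
## A minimal sketch of what get_prob_nb could look like (the real helper is
## defined elsewhere in this package); assumes non-negative integer counts and
## uses MASS::fitdistr, which returns the maximised log-likelihood in $loglik.
# get_prob_nb_sketch <- function(vec)
# {
#     fit = MASS::fitdistr(vec, 'negative binomial')
#     return( fit$loglik )
# }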

p_likelihood_ratio_norm <- function( A, head, mid, tail )
{
    A_whole = A[head:tail, head:tail]
    A_a = A[head:mid, head:mid]
    A_b = A[(mid+1):tail, (mid+1):tail]
    A_ab = A[head:mid, (mid+1):tail]

    tri_Awhole = A_whole[upper.tri(A_whole, diag=TRUE)]
    tri_Aa = A_a[upper.tri(A_a, diag=TRUE)]
    tri_Ab = A_b[upper.tri(A_b, diag=TRUE)]

    ## normal model
    inner_a = likelihood_norm( tri_Aa )
    inner_b = likelihood_norm( tri_Ab )
    inter_ab = likelihood_norm( A_ab )

    ## log-likelihood of H1
    LH1 = inner_a + inner_b + inter_ab
    LH0 = likelihood_norm( tri_Awhole )
    Lambda = -2*( LH0 - LH1 )
    # cat(Lambda, '\n')

    n_parameters = 2 ## mean and variance per block
    df_h0 = 1*n_parameters
    df_h1 = 3*n_parameters
    df = df_h1 - df_h0
    p = pchisq(Lambda, df=df, lower.tail = FALSE, log.p = FALSE)
    info = list( Lambda=Lambda, p=p )
    return(info)
}
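
## A minimal sketch of what likelihood_norm could compute (the real helper lives
## elsewhere): the Gaussian log-likelihood at the MLE, where the variance is the
## biased (divide-by-n) estimate rather than var()/sd().
# likelihood_norm_sketch <- function(vec)
# {
#     mu = mean(vec)
#     sigma = sqrt( mean( (vec - mu)^2 ) )  ## MLE of sd, not sd(vec)
#     return( sum( dnorm(vec, mean=mu, sd=sigma, log=TRUE) ) )
# }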

p_likelihood_ratio_gamma <- function( A, head, mid, tail, n_parameters, imputation )
{
    A_whole = A[head:tail, head:tail]
    A_a = A[head:mid, head:mid]
    A_b = A[(mid+1):tail, (mid+1):tail]
    A_ab = A[head:mid, (mid+1):tail]

    tri_Awhole = A_whole[upper.tri(A_whole, diag=TRUE)]
    ## added 25-03-2018. If there are no zero values, skip imputation
    if( sum(tri_Awhole==0) == 0 ) imputation = FALSE
    tri_Aa = A_a[upper.tri(A_a, diag=TRUE)]
    tri_Ab = A_b[upper.tri(A_b, diag=TRUE)]

    ## gamma model
    inner_a = likelihood_gamma_mme( tri_Aa )
    inner_b = likelihood_gamma_mme( tri_Ab )
    inter_ab = likelihood_gamma_mme( A_ab )
    whole = likelihood_gamma_mme( tri_Awhole )

    if( imputation ) ## zero values are imputed by the distribution estimated from the non-zero values
    {
        ## rescale each non-zero log-likelihood to the full block size
        inner_a = likelihood_gamma_mme( tri_Aa[tri_Aa!=0] )/length( tri_Aa[tri_Aa!=0] )*length( tri_Aa )
        inner_b = likelihood_gamma_mme( tri_Ab[tri_Ab!=0] )/length( tri_Ab[tri_Ab!=0] )*length( tri_Ab )
        inter_ab = likelihood_gamma_mme( A_ab )/length( A_ab[A_ab!=0] )*length( A_ab )
        whole = likelihood_gamma_mme( tri_Awhole[tri_Awhole!=0] )/length( tri_Awhole[tri_Awhole!=0] )*length( tri_Awhole )
        n_parameters = n_parameters - 1 ## the mixture parameter for 0 is not taken into account
    }

    ## log-likelihood of H1
    LH1 = inner_a + inner_b + inter_ab
    LH0 = whole
    Lambda = -2*( LH0 - LH1 )
    # cat(Lambda, '\n')

    df_h0 = 1*n_parameters
    df_h1 = 3*n_parameters
    df = df_h1 - df_h0
    p = pchisq(Lambda, df=df, lower.tail = FALSE, log.p = FALSE)
    info = list( Lambda=Lambda, p=p )
    return(info)
}
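
## A minimal sketch of what likelihood_gamma_mme might do (the real helper is
## defined elsewhere): the gamma log-likelihood evaluated at method-of-moments
## estimates, shape = mean^2/var and rate = mean/var; assumes positive input.
# likelihood_gamma_mme_sketch <- function(vec)
# {
#     m = mean(vec); v = var(vec)
#     shape = m^2 / v
#     rate = m / v
#     return( sum( dgamma(vec, shape=shape, rate=rate, log=TRUE) ) )
# }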

lognormal_mean_test <- function( cA, head, mid, tail )
{
    A_whole = cA[head:tail, head:tail]
    A_a = cA[head:mid, head:mid]
    A_b = cA[(mid+1):tail, (mid+1):tail]
    A_ab = cA[head:mid, (mid+1):tail]

    tri_Aa = A_a[upper.tri(A_a, diag=TRUE)]
    tri_Ab = A_b[upper.tri(A_b, diag=TRUE)]
    tri_Aa_vec = as.vector( tri_Aa )
    tri_Ab_vec = as.vector( tri_Ab )
    A_ab_vec = as.vector( A_ab )

    ## keep only the positive (non-zero) entries
    tri_Aa_vec_p = tri_Aa_vec[tri_Aa_vec!=0]
    tri_Ab_vec_p = tri_Ab_vec[tri_Ab_vec!=0]
    A_ab_vec_p = A_ab_vec[A_ab_vec!=0]

    p_Aa = p_lognormal_mean( tri_Aa_vec_p, A_ab_vec_p )
    p_Ab = p_lognormal_mean( tri_Ab_vec_p, A_ab_vec_p )
    return( list(p_Aa=p_Aa, p_Ab=p_Ab) )
}

## https://www.jstor.org/stable/2533570
## Zhou, Gao & Hui (1997), "Methods for Comparing the Means of Two Independent
## Log-Normal Samples", Biometrics 53(3)
## 03-07-2018
p_lognormal_mean <- function( vec_aa, vec_ab )
{
    if( all(vec_aa==0) | all(vec_ab==0) ) return(0)
    n_aa = length(vec_aa)
    n_ab = length(vec_ab)

    fitted_info_aa = MASS::fitdistr(vec_aa, 'lognormal') ## intra
    mu_aa = fitted_info_aa$estimate[1]
    sd_aa = fitted_info_aa$estimate[2]
    s2_aa = sd_aa^2
    fitted_info_ab = MASS::fitdistr(vec_ab, 'lognormal') ## inter
    mu_ab = fitted_info_ab$estimate[1]
    sd_ab = fitted_info_ab$estimate[2]
    s2_ab = sd_ab^2

    ## a lognormal mean is exp(mu + s2/2), so equality of means on the original
    ## scale is tested through (mu_aa - mu_ab) + (s2_aa - s2_ab)/2 (the z-score
    ## method of the paper above)
    z_score = ( (mu_aa - mu_ab) + (1/2)*(s2_aa - s2_ab) ) / sqrt( s2_aa/n_aa + s2_ab/n_ab + (1/2)*( s2_aa^2/(n_aa-1) + s2_ab^2/(n_ab-1) ) )
    p = pnorm( z_score, lower.tail=FALSE )
    return(p)
}
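
## Quick sanity check on simulated data (illustrative only, not part of the
## pipeline): two samples with identical lognormal parameters should give a
## large one-sided p on average, while inflating meanlog of the first sample
## should shrink it.
# set.seed(1)
# p_lognormal_mean( rlnorm(500, meanlog=1.0, sdlog=0.5),
#                   rlnorm(500, meanlog=1.0, sdlog=0.5) )  ## large p expected
# p_lognormal_mean( rlnorm(500, meanlog=1.5, sdlog=0.5),
#                   rlnorm(500, meanlog=1.0, sdlog=0.5) )  ## small p expected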

get_corner_xy <- function(A_whole)
{
    ## take the top-right corner block of size floor(n/2) and split it along its
    ## own diagonal into the two halves compared by the test
    n = nrow(A_whole)
    corner_size = floor(n/2)
    A_corner = A_whole[1:corner_size, (n - corner_size + 1 ):n]
    expected_high = A_corner[upper.tri(A_corner, diag=FALSE)]
    expected_low = A_corner[lower.tri(A_corner, diag=TRUE)]
    return(list(x=expected_low, y=expected_high))
}

get_half_mat_values <- function(mat)
{
    ## split a (possibly non-square) matrix into the values below the scaled
    ## diagonal (x) and the remaining values (y)
    n1 = nrow(mat)
    n2 = ncol(mat)
    delta = n1/n2
    rows = lapply( 1:n2, function(x) (1+ceiling(x*delta)):n1 )
    rows = rows[ sapply(rows, function(v) max(v) <= n1) ]
    flag = which(diff(sapply(rows, length)) > 0)
    if(length(flag)>0) rows = rows[ 1:min(flag) ]
    row_col_indices = cbind( unlist(rows), rep(1:length(rows), sapply(rows, length)) )
    x = mat[row_col_indices]
    mat[row_col_indices] = NA
    y = na.omit(as.vector(mat))
    return(list(x=x, y=y))
}

get_half_mat_values_v2 <- function(mat)
{
    ## generalisation of upper.tri()/lower.tri() to non-square matrices:
    ## https://stackoverflow.com/questions/52990525/get-upper-triangular-matrix-from-nonsymmetric-matrix/52991508#52991508
    y = mat[nrow(mat) * (2 * col(mat) - 1) / (2 * ncol(mat)) - row(mat) > -1/2]
    x = mat[nrow(mat) * (2 * col(mat) - 1) / (2 * ncol(mat)) - row(mat) < -1/2]
    return(list(x=x, y=y))
}
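
## Small worked example (illustrative only): on a 4 x 2 matrix, y collects the
## entries above the scaled diagonal (toward the top-right corner) and x the
## entries below it (toward the bottom-left corner).
# m = matrix(1:8, nrow=4)
# get_half_mat_values_v2(m)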

p_wilcox_test_nested <- function( A, head, mid, tail, alternative, is_CD )
{
    test_1 = p_wilcox_test( A, head, mid, tail, alternative, is_CD, only_corner=FALSE ) #: corner + inter
    if( (tail - head <= 4) | (test_1$p > 0.05) ) ## when p > 0.05, or the domain is already small, do not consider it as nested
    {
        info = list( Lambda=NULL, p=0.555555, mean_diff=0 )
        return(info)
    }

    ## try-error happens when the TAD to test is too small. THEREFORE, ASSIGN P=0 TO THE TAD
    test_left_tad = try(p_wilcox_test( A, head, mid=ceiling((head+mid)/2), mid, alternative, is_CD=FALSE, only_corner=TRUE )) #: corner + inter
    if( inherits(test_left_tad, "try-error") ) test_left_tad = list(p=0)
    test_right_tad = try(p_wilcox_test( A, mid+1, mid=ceiling((mid+1+tail)/2), tail, alternative, is_CD=FALSE, only_corner=TRUE )) #: corner + inter
    if( inherits(test_right_tad, "try-error") ) test_right_tad = list(p=0)

    info = list( Lambda=NULL, p=max( test_1$p, min(test_left_tad$p, test_right_tad$p) ), mean_diff=0 )
    return(info)
}

p_wilcox_test = function( A, head, mid, tail, alternative, is_CD=FALSE, only_corner=FALSE ) ## only_corner tests if a domain is a TAD (no nesting)
{
    A_whole = A[head:tail, head:tail]
    A_a = A[head:mid, head:mid]
    A_b = A[(mid+1):tail, (mid+1):tail]
    A_ab = A[head:mid, (mid+1):tail]

    tri_Awhole = A_whole[upper.tri(A_whole, diag=TRUE)] ## need to check whether diag should be FALSE or TRUE
    tri_Aa = A_a[upper.tri(A_a, diag=TRUE)] ## need to check whether diag should be FALSE or TRUE
    tri_Ab = A_b[upper.tri(A_b, diag=TRUE)] ## need to check whether diag should be FALSE or TRUE
    tri_Aa_vec = as.vector( tri_Aa )
    tri_Ab_vec = as.vector( tri_Ab )
    A_ab_vec = as.vector( A_ab )

    corner_mat_info = get_half_mat_values_v2(A_ab)
    A_ab_corner = as.vector(corner_mat_info$y)
    A_ab_ncorner = corner_mat_info$x

    p_corner = wilcox.test(x=A_ab_corner, y=A_ab_ncorner, alternative='greater', exact=FALSE)
    p_inter = wilcox.test(x=c(tri_Ab_vec, tri_Aa_vec), y=A_ab_ncorner, alternative='greater', exact=FALSE)
    # p_inter = wilcox.test(x=c(tri_Ab_vec, tri_Aa_vec), y=c(A_ab_ncorner, A_ab_corner), alternative='greater', exact=FALSE)
    if(is_CD==FALSE) p = max(p_inter$p.value, p_corner$p.value) ## the tested part is not a CD
    if(is_CD==TRUE) p = p_inter$p.value ## the tested part is a CD
    # if(only_corner==TRUE) p = p_corner$p.value

    mean_diff_inter = mean(A_ab_ncorner) - mean(c(tri_Ab_vec, tri_Aa_vec)) ## negative is good
    mean_diff_corner = mean(A_ab_ncorner) - mean(A_ab_corner) ## negative is good
    if(is_CD==FALSE) mean_diff = min(mean_diff_corner, mean_diff_inter) ## the tested part is not a CD
    if(is_CD==TRUE) mean_diff = mean_diff_inter
    if(min(length(tri_Aa_vec), length(tri_Ab_vec)) < 10) mean_diff = 100 ## when one of the two twins is too small. 10: dim(4,4)

    # mean_diff = mean(A_ab_corner) - mean(A_ab_ncorner)
    # p_test_Aa = wilcox.test(x=A_ab_vec, y=tri_Aa_vec, alternative="less", exact=FALSE)
    # p_test_Ab = wilcox.test(x=A_ab_vec, y=tri_Ab_vec, alternative="less", exact=FALSE)
    # p = wilcox.test(x=c(tri_Ab_vec, tri_Aa_vec), y=A_ab_vec, alternative=alternative, exact=FALSE)
    # xy = get_corner_xy(A_whole)
    # p = wilcox.test(x=xy$x, y=xy$y, alternative=alternative, exact=FALSE)
    # p = max(p_test_Aab$p.value, p_test_Ab$p.value)

    info = list( Lambda=NULL, p=p, mean_diff=mean_diff )
    return(info)
}
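
## Usage sketch (hypothetical call): one-sided Wilcoxon tests that within-domain
## contacts dominate the between-domain block for a candidate split at 50. Note
## that the active tests hard-code alternative='greater'; the `alternative`
## argument is only used by the commented-out variants above.
# res = p_wilcox_test( A, head=1, mid=50, tail=100, alternative='greater' )
# res$p          ## max of the corner and inter tests (non-CD case)
# res$mean_diff  ## negative values support the split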

p_likelihood_ratio_lnorm <- function( A, head, mid, tail, n_parameters, imputation, imputation_num=1E2 )
{
    likelihood_fun = likelihood_lnorm_mle

    A_whole = A[head:tail, head:tail]
    A_a = A[head:mid, head:mid]
    A_b = A[(mid+1):tail, (mid+1):tail]
    A_ab = A[head:mid, (mid+1):tail]
    tri_Awhole = A_whole[upper.tri(A_whole, diag=TRUE)]

    ## added 25-03-2018. If there are no zero values, skip imputation
    no_zero_flag = 0
    if( sum(tri_Awhole==0) == 0 ) { no_zero_flag = 1; imputation = FALSE }

    tri_Aa = A_a[upper.tri(A_a, diag=TRUE)]
    tri_Ab = A_b[upper.tri(A_b, diag=TRUE)]
    tri_Aa_vec = as.vector( tri_Aa )
    tri_Ab_vec = as.vector( tri_Ab )
    A_ab_vec = as.vector( A_ab )

    mean_a = mean( tri_Aa_vec[tri_Aa_vec!=0] )
    mean_b = mean( tri_Ab_vec[tri_Ab_vec!=0] )
    mean_ab = mean( A_ab_vec[A_ab_vec!=0] )
    mean_diff = mean_ab - min(mean_a, mean_b)

    if( (all(tri_Aa_vec==0)) | (all(tri_Ab_vec==0)) | (all(A_ab_vec==0)) )
    {
        info = list( Lambda=NA, p=0, mean_diff=mean_diff )
        return(info)
    }

    ## lognormal model
    if(!imputation)
    {
        inner_a = likelihood_fun( tri_Aa )
        inner_b = likelihood_fun( tri_Ab )
        inter_ab = likelihood_fun( A_ab )
        whole = likelihood_fun( tri_Awhole )

        ## log-likelihood of H1
        LH1 = inner_a + inner_b + inter_ab
        LH0 = whole
        Lambda = -2*( LH0 - LH1 )

        if( no_zero_flag ) n_parameters = n_parameters - 1 ## no alpha (zero-mixture) parameter
        df_h0 = 1*n_parameters ## (theta_1 = theta_2 = theta_3)
        df_h1 = 3*n_parameters ## (theta_1, theta_2, theta_3)
        df = df_h1 - df_h0
        p = pchisq(Lambda, df=df, lower.tail = FALSE, log.p = FALSE)
        info = list( Lambda=Lambda, p=p, mean_diff=mean_diff )
        return(info)
    }

    if( imputation ) ## zero values are imputed from the distribution estimated on the non-zero values
    {
        vec_list = list( tri_Aa=tri_Aa, tri_Ab=tri_Ab, A_ab=A_ab )
        imputations = imputation_list( vec_list, imputation_num )
        inner_as = apply(imputations$tri_Aa, 1, likelihood_fun)
        inner_bs = apply(imputations$tri_Ab, 1, likelihood_fun)
        inter_abs = apply(imputations$A_ab, 1, likelihood_fun)
        wholes = apply(do.call( cbind, imputations ), 1, likelihood_fun)

        LH1s = inner_as + inner_bs + inter_abs
        LH0s = wholes
        Lambdas = -2*( LH0s - LH1s )
        Lambda = mean( Lambdas ) ## average the statistic over the imputed data sets

        n_parameters = n_parameters - 1 ## the mixture parameter is not taken into account
        # cat(Lambda, '\n')
        df_h0 = 1*n_parameters
        df_h1 = 3*n_parameters
        df = df_h1 - df_h0
        p = pchisq(Lambda, df=df, lower.tail = FALSE, log.p = FALSE)
        info = list( Lambda=Lambda, p=p, mean_diff=mean_diff )
        return(info)

        # if( imputation ) ## zero values are imputed from the distribution estimated on the non-zero values
        # {
        #     inner_a = likelihood_lnorm( tri_Aa[tri_Aa!=0] )/length( tri_Aa[tri_Aa!=0] )*length( tri_Aa )
        #     inner_b = likelihood_lnorm( tri_Ab[tri_Ab!=0] )/length( tri_Ab[tri_Ab!=0] )*length( tri_Ab )
        #     inter_ab = likelihood_lnorm( A_ab )/length( A_ab[A_ab!=0] )*length( A_ab )
        #     whole = likelihood_lnorm( tri_Awhole[tri_Awhole!=0] )/length( tri_Awhole[tri_Awhole!=0] )*length( tri_Awhole )
        #     n_parameters = n_parameters - 1 ## the mixture parameter of 0 is not taken into account
        # }
    }
}
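
## A minimal sketch of what likelihood_lnorm_mle might return (the real helper
## is defined elsewhere; whether it drops zeros is an assumption here): the
## maximised lognormal log-likelihood via MASS::fitdistr, stored in $loglik.
# likelihood_lnorm_mle_sketch <- function(vec)
# {
#     v = as.vector(vec)
#     v = v[v != 0]
#     fit = MASS::fitdistr(v, 'lognormal')
#     return( fit$loglik )
# }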

imputation_list <- function( vec_list, imputation_num )
{
    imputations = lapply( vec_list, function(v)
    {
        ## a single non-zero value: no distribution can be fitted, just replicate it
        if(sum(v!=0)==1) { final_vec = matrix( v[v!=0], imputation_num, length(v) ); return(final_vec) }
        fit = fit_lnorm(v)
        set.seed(1) ## make the imputation deterministic
        if(is.na(fit['sdlog'])) stop("In function imputation_list, sdlog=NA encountered")
        ## one row per imputation: draw replacements for the zero entries
        imputation_vec = matrix(rlnorm( sum(v==0)*imputation_num, fit['meanlog'], fit['sdlog'] ), nrow=imputation_num)
        ori_vec = t(replicate(imputation_num, v[v!=0]))
        final_vec = cbind( ori_vec, imputation_vec )
        return(final_vec)
    } )
    return( imputations )
}
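
## Illustrative call (hypothetical data): each list element becomes an
## imputation_num x length(v) matrix whose rows are one completed copy of v,
## with the observed non-zero values first and imputed draws for the zeros last.
# imps = imputation_list( list( tri_Aa = c(2.1, 0, 3.4, 0, 1.7) ), imputation_num=3 )
# dim( imps$tri_Aa )  ## 3 x 5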

fit_lnorm <- function(vec)
{
    vec = vec[vec!=0]
    fit = MASS::fitdistr(vec, 'lognormal')$estimate
    return(fit)
}