general_functions.R 55 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270
  1. get_ccA_atanh = function(contact_mat_file, compress_size, bin_size)
  2. {
  3. contact_mat_raw = data.table::fread(contact_mat_file)
  4. contact_mat_raw = subset(contact_mat_raw, !is.na(V3))
  5. contact_mat_raw[,1] = contact_mat_raw[,1]/bin_size
  6. contact_mat_raw[,2] = contact_mat_raw[,2]/bin_size
  7. contact_mat = contact_mat_raw
  8. colnames(contact_mat) = c('pos_1', 'pos_2', 'val')
  9. if(!all(contact_mat[[2]] >= contact_mat[[1]])) stop('\nYour provided matrix does not represent an upper triangular matrix!\n\n')
  10. n_bins_whole = max(max(contact_mat[[1]]), max(contact_mat[[2]])) + 1 ## should +1 because contact_mat index starts from 0 (bin 0 represents: 0-10E3, checked by looking at the juicebox map, 2018-11-19)
  11. contact_mat_sparse = Matrix(0, nrow=n_bins_whole, ncol=n_bins_whole)
  12. contact_mat_sparse[cbind(contact_mat[[1]]+1, contact_mat[[2]]+1)] <- contact_mat[[3]]
  13. rownames(contact_mat_sparse) = colnames(contact_mat_sparse) = as.character( 1:nrow(contact_mat_sparse) )
  14. contact_mat_sparse = Matrix::forceSymmetric(contact_mat_sparse, uplo='U')
  15. # if(bin_size!=bin_size_initial) contact_mat_sparse = mat_10to40kb( contact_mat_sparse, bin_size, bin_size_initial )
  16. A = remove_blank_cols(contact_mat_sparse, sparse=TRUE, ratio=zero_ratio) ## has the same rows/cols as A
  17. if(nrow(A) < 100) A = remove_blank_cols(contact_mat_sparse, sparse=TRUE, ratio=0) ## when all are dense
  18. while(min(apply(A, 1, sd))==0) ## sometimes after removing the cols / rows, the remained part will all be 0
  19. {
  20. A = remove_blank_cols(A, sparse=TRUE, ratio=1E-7) ## has the same rows/cols as A
  21. if(nrow(A) < 1) stop('ERROR IN GENERATING MEANINGFUL A at the data generating step')
  22. }
  23. # #########################################################
  24. n_bins_A = nrow(A) - nrow(A)%%compress_size
  25. A_2_compress = A[, 1:n_bins_A]
  26. bin_names = rownames(A)
  27. A_compressed = compress_mat_fast( as.matrix(A_2_compress), compress_size=compress_size )
  28. colnames(A_compressed) = bin_names
  29. rm(A_2_compress); gc()
  30. A_compressed_sparse = A_compressed
  31. A_compressed = as.matrix(A_compressed)
  32. A_compressed_log = log2(A_compressed + 1)
  33. # #########################################################
  34. cat('Compute correlation matrix\n')
  35. cA_compressed_log = fast_cor(A_compressed_log)
  36. ccA_compressed_log = fast_cor(cA_compressed_log)
  37. cat('Finished computing correlation matrix\n')
  38. # #########################################################
  39. # # ccA_compressed_atanh = atanh(ccA_compressed - 1E-7)
  40. ccA_atanh = atanh(ccA_compressed_log / (1+1E-7))
  41. A = as.matrix(A)
  42. rm(A_compressed, A_compressed_sparse, cA_compressed_log, A_compressed_log); gc()
  43. # #########################################################
  44. cat('Finished data generation\n')
  45. return(list(A=A, ccA_atanh=ccA_atanh))
  46. }
  47. get_detailed_full_tree <- function(res_info)
  48. {
  49. # attach(res_info)
  50. for( i in 1:nrow(res_info$segmentss) )
  51. {
  52. index_left2adj = which(igraph::V(res_info$hi_tree)$left_rescaled == res_info$segmentss_nadj[i,1])
  53. index_right2adj = which(igraph::V(res_info$hi_tree)$right_rescaled == res_info$segmentss_nadj[i,2])
  54. igraph::V(res_info$hi_tree)[index_left2adj]$left_rescaled = res_info$segmentss[i,1]
  55. igraph::V(res_info$hi_tree)[index_right2adj]$right_rescaled = res_info$segmentss[i,2]
  56. igraph::V(res_info$hi_tree)$width_rescaled = igraph::V(res_info$hi_tree)$right_rescaled - igraph::V(res_info$hi_tree)$left_rescaled + 1
  57. igraph::V(res_info$hi_tree)$name = paste('(',igraph::V(res_info$hi_tree)$left_rescaled, ',', igraph::V(res_info$hi_tree)$right_rescaled, ')', sep='')
  58. }
  59. ## xenocfraf:
  60. # res_inner = res_inner[sapply(res_inner, length) > 0]
  61. ## xenocfraf:
  62. # res_inner = res_inner[sapply(res_inner, length) > 0]
  63. branches = lapply( res_info$res_inner, get_tree_v0 )
  64. for( i in 1:length(branches) ) branches[[i]] = update_branch_name(branches[[i]], root_start=res_info$segmentss[i,1])
  65. trunk = res_info$hi_tree
  66. full_tree = xenocraft( trunk, branches )
  67. names_tmp = do.call(rbind, strsplit(igraph::V(full_tree)$name, ','))
  68. igraph::V(full_tree)$left = substring(names_tmp[,1], 2)
  69. igraph::V(full_tree)$right = substring(names_tmp[,2], 1, nchar(names_tmp[,2])-1)
  70. if(!is_binary_tree(full_tree)) stop("Trunk + branches do not produce a binary tree")
  71. detailed_full_tree = tree_germination(full_tree)
  72. # detach(res_info)
  73. return(detailed_full_tree)
  74. }
  75. insulation_score_fun = function(A, size=3)
  76. {
  77. insulation_score_fun_helper = function(bin_start)
  78. {
  79. bin_mid = bin_start + size -1
  80. bin_end = bin_start + 2*size - 1
  81. up = sum(A[bin_start:bin_mid, bin_start:bin_mid])
  82. down = sum(A[(bin_mid+1):bin_end, (bin_mid+1):bin_end])
  83. inter = 2*sum(A[bin_start:bin_mid, (bin_mid+1):bin_end])
  84. insulation = 1 - inter / (up + down)
  85. return(insulation)
  86. }
  87. bin_starts = 1:(nrow(A) - 2*size + 1)
  88. insulations = sapply(bin_starts, function(bin_start) insulation_score_fun_helper(bin_start))
  89. names(insulations) = rownames(A)[size:(nrow(A) - size)]
  90. return(insulations)
  91. }
  92. ## generating the compartment_segs based on A and initial_clusters
  93. generate_compartment_segs <- function( initial_clusters )
  94. {
  95. bins_seq = unname(unlist(initial_clusters))
  96. if(!all(diff(bins_seq)==1)) stop('check generate_compartment_segs in generate_compartment_segs.R')
  97. if(bins_seq[1]!=1) stop('check generate_compartment_segs in generate_compartment_segs.R')
  98. compartment_segs = do.call(rbind, lapply(initial_clusters, function(v) v[c(1, length(v))]))
  99. compartment_segs = data.frame( start_pos=compartment_segs[,1], end_pos=compartment_segs[,2] )
  100. return( compartment_segs )
  101. }
  102. ## This function gets the nodes at level k
  103. ## Yuanlong LIU
  104. ## 02_07_2018
  105. get_level_k_nodes <- function(tree, k)
  106. {
  107. nodes = igraph::ego(tree, order=k, nodes=1, mode = "out", mindist = 1)[[1]]
  108. return(nodes)
  109. }
  110. ## This function gets the TADs at level k
  111. ## Yuanlong LIU
  112. ## 18_05_2018
  113. get_level_k_TADs <- function(branches, k)
  114. {
  115. tad_sizes_ind = lapply( branches, function(branch)
  116. {
  117. branch_level_k = igraph::induced.subgraph( branch, igraph::V(branch)$depth <= k )
  118. tad_size_ind = get_leaves(branch_level_k, 'igraph')$width
  119. })
  120. tad_sizes = unlist(tad_sizes_ind)
  121. end_pos = cumsum(tad_sizes)
  122. start_pos = c(1, 1 + end_pos[-length(end_pos)])
  123. tads = data.frame(start_pos=start_pos, end_pos=end_pos)
  124. return( tads )
  125. }
  126. get_least_residue_matrix <- function(pA_sym, max_nbins, allowed_shifts)
  127. {
  128. nbins = nrow( pA_sym )
  129. max_nbins_bp = max_nbins
  130. # remainders = nbins %% (max_nbins + (-10:10))
  131. remainders = nbins %% (max_nbins + allowed_shifts)
  132. max_nbins = (max_nbins + allowed_shifts)[ which.min(remainders) ]
  133. n2one = floor(nbins/max_nbins)
  134. remainder = nbins %% max_nbins
  135. if(max_nbins!=max_nbins_bp) warning('Value of max_nbins has been adjusted from ', max_nbins_bp, ' to ', max_nbins, '. This results in ', remainder, ' rows/columns excluded for the analysis.')
  136. if(remainder!=0) A_final = pA_sym[-(tail(1:nbins, remainder)), -(tail(1:nbins, remainder))]
  137. if(remainder==0) A_final = pA_sym
  138. res = list(A_final=A_final, n2one=n2one, max_nbins_new=max_nbins)
  139. }
  140. ## merge clusters based on their corr_mat
  141. ## Yuanlong LIU
  142. ## 01-07-2018
  143. sim.fun <- function( corr_mat, mod1, mod2, method='mean' )
  144. {
  145. if(method=='mean') return(mean(corr_mat[mod1, mod2]))
  146. if(method=='median') return(median(corr_mat[mod1, mod2]))
  147. if(method=='max') return(max(corr_mat[mod1, mod2]))
  148. }
  149. ############################################################
  150. ## this function removes all-zero columns / rows
  151. rm_zeros <- function(A) ## mat is a symmetric matrix
  152. {
  153. zero_indices = apply( A, 2, function(v) all(v==0) )
  154. if( all(zero_indices==0) ) return(A)
  155. A = A[!zero_indices, !zero_indices]
  156. return(A)
  157. }
  158. my_dist <- function(m) {mtm <- Matrix::tcrossprod(m); sq <- rowSums(m*m); res = suppressWarnings(sqrt(outer(sq,sq,"+") - 2*mtm)); res[is.nan(res)]=0; diag(res)=0; return(res)}
  159. MoC = function(P, Q)
  160. {
  161. len_p = length(P)
  162. len_q = length(Q)
  163. if((len_p==len_q) & (len_p==1)) return(1)
  164. grids = expand.grid(1:len_p, 1:len_q)
  165. vecs = apply( grids, 1, function(x) {u=x[1]; v=x[2]; length(intersect( P[[u]], Q[[v]] ))^2/length(P[[u]])/length(Q[[v]])} )
  166. res = 1/( sqrt(len_p*len_q) - 1)*(sum(vecs) - 1)
  167. return(res)
  168. }
  169. get_stair_vecs <- function(mat, from, to)
  170. {
  171. n = nrow(mat)
  172. solve <- function(mid) c(mat[1:mid,(mid+1):n])
  173. lapply(from:to, solve)
  174. }
  175. correct_A_fast_divide_by_mean <- function(A, remove_zero=TRUE, divide_or_substract='divide', mean_or_median='mean')
  176. {
  177. f = function(i, d) d*(d-1)/2+1+(2*d+i)*(i-1)/2
  178. n = nrow(A)
  179. upper_tri_values = A[upper.tri(A, diag=TRUE)]
  180. means_mat = indices = A*0
  181. indices[upper.tri(indices, diag=TRUE)] = 1:((n+1)*n/2)
  182. diag_values = orders = rep(list(list()), n)
  183. for(d in seq_along(orders)) orders[[d]] = f(i=seq(1, n-d+1), d=d)
  184. for(d in seq_along(orders)) diag_values[[d]] = upper_tri_values[f(i=seq(1, n-d+1), d=d)]
  185. if(mean_or_median=='mean'){
  186. if(remove_zero) means = unlist(lapply( diag_values, function(v) {m=mean(v[v!=0]); v=v*0+m; v[is.na(v)]=0; return(v) } ))
  187. if(!remove_zero) means = unlist(lapply( diag_values, function(v) {m=mean(v); v=v*0+m; return(v) } ))
  188. }
  189. if(mean_or_median=='median'){
  190. if(remove_zero) means = unlist(lapply( diag_values, function(v) {m=median(v[v!=0]); v=v*0+m; v[is.na(v)]=0; return(v) } ))
  191. if(!remove_zero) means = unlist(lapply( diag_values, function(v) {m=median(v); v=v*0+m; return(v) } ))
  192. }
  193. means[means==0] = 1
  194. means_mat[upper.tri(means_mat, diag=TRUE)] = means[order(unlist(orders))]
  195. if(divide_or_substract=='divide') A_corrected = A / means_mat
  196. if(divide_or_substract=='substract') A_corrected = A - means_mat
  197. A_corrected = as.matrix( Matrix::forceSymmetric(A_corrected) )
  198. rownames( A_corrected ) = colnames( A_corrected ) = rownames( A )
  199. return( A_corrected )
  200. }
  201. correct_A_fast_equal_mean <- function(A, remove_zero=TRUE, divide_or_substract='divide', mean_or_median='mean')
  202. {
  203. ## may also try log normal, since the off-diagnal values follow well a log-normal distribution
  204. ## commented by Yuanlong LIU, 2018-07-26
  205. f = function(i, d) d*(d-1)/2+1+(2*d+i)*(i-1)/2
  206. n = nrow(A)
  207. upper_tri_values = A[upper.tri(A, diag=TRUE)]
  208. means_mat = indices = A*0
  209. indices[upper.tri(indices, diag=TRUE)] = 1:((n+1)*n/2)
  210. diag_values = orders = rep(list(list()), n)
  211. for(d in seq_along(orders)) orders[[d]] = f(i=seq(1, n-d+1), d=d)
  212. for(d in seq_along(orders)) diag_values[[d]] = upper_tri_values[f(i=seq(1, n-d+1), d=d)]
  213. if(mean_or_median=='mean'){
  214. if(remove_zero) means = unlist(lapply( diag_values, function(v) {m=mean(v[v!=0]); v=v*0+m; v[is.na(v)]=0; return(v) } ))
  215. if(!remove_zero) means = unlist(lapply( diag_values, function(v) {m=mean(v); v=v*0+m; return(v) } ))
  216. }
  217. if(mean_or_median=='median'){
  218. if(remove_zero) means = unlist(lapply( diag_values, function(v) {m=median(v[v!=0]); v=v*0+m; v[is.na(v)]=0; return(v) } ))
  219. if(!remove_zero) means = unlist(lapply( diag_values, function(v) {m=median(v); v=v*0+m; return(v) } ))
  220. }
  221. means[means==0] = 1
  222. means_mat[upper.tri(means_mat, diag=TRUE)] = means[order(unlist(orders))]
  223. if(divide_or_substract=='divide') A_corrected = A / means_mat
  224. if(divide_or_substract=='substract') A_corrected = A - means_mat
  225. A_corrected = as.matrix( Matrix::forceSymmetric(A_corrected) )
  226. rownames( A_corrected ) = colnames( A_corrected ) = rownames( A )
  227. return( A_corrected )
  228. }
  229. creat_phylo_object <- function(tree)
  230. {
  231. creat_phylo_object_inner <- function(tree)
  232. {
  233. twins = igraph::ego(tree, order=1, node=1, mode='out', mindist=1)[[1]]
  234. branches = igraph::decompose(tree - 1)
  235. left_branch = branches[[1]]
  236. right_branch = branches[[2]]
  237. left_branch_flag = igraph::vcount( left_branch ) == 1
  238. right_branch_flag = igraph::vcount( right_branch ) == 1
  239. short_walk_dist = 1
  240. igraph::diameters = sapply(branches, igraph::diameter)
  241. long_walk_dist = abs(diff(igraph::diameters)) + 1
  242. if( left_branch_flag ) left_branch_newick = igraph::V(left_branch)$name
  243. if( !left_branch_flag ) left_branch_newick = creat_phylo_object_inner(left_branch)
  244. if( right_branch_flag ) right_branch_newick = igraph::V(right_branch)$name
  245. if( !right_branch_flag ) right_branch_newick = creat_phylo_object_inner(right_branch)
  246. if( igraph::diameters[1] > igraph::diameters[2] ) tree_newick = paste( '(', left_branch_newick, ':', short_walk_dist, ',', right_branch_newick, ':', long_walk_dist, ')', sep='' )
  247. if( igraph::diameters[1] <= igraph::diameters[2] ) tree_newick = paste( '(', left_branch_newick, ':', long_walk_dist, ',', right_branch_newick, ':', short_walk_dist, ')', sep='' )
  248. return( tree_newick )
  249. }
  250. tree_newick = creat_phylo_object_inner(tree)
  251. final_newick = paste(tree_newick, ';', sep='')
  252. tree = read.tree(text=final_newick)
  253. return( tree )
  254. }
  255. dist_max_min <- function( L_diff )
  256. {
  257. N = dim(L_diff)[1]
  258. L_diff_dist = list()
  259. for( dist in 1:(N-1) )
  260. {
  261. maxLs = numeric( N-dist )
  262. for( row in 1:(N-dist) )
  263. {
  264. i = row
  265. j = row + dist
  266. maxLs[i] = L_diff[i, j]
  267. }
  268. L_diff_dist[[dist]] = maxLs
  269. }
  270. return(L_diff_dist)
  271. }
  272. divide_into_groups <- function(A, n_group)
  273. {
  274. N = nrow(A)
  275. dists = 1:(N-1)
  276. counts = sapply(dists, function(v) n_cells2compute( A, v ))
  277. average = (ceiling(sum(counts) / n_group)) ## the average computational complexity
  278. groups <- cumsum(counts) %/% average + 1
  279. borders = c(0, which(diff(groups)!=0))
  280. binsizes = c()
  281. for( i in 1:(length(borders)-1) )
  282. {
  283. binsizes = c(binsizes, sum(counts[ (borders[i]+1):borders[i+1] ]))
  284. }
  285. info = list(borders=borders, binsizes=binsizes)
  286. return(info)
  287. }
  288. get_leaves <- function(tree, type='name')
  289. {
  290. leaf_indices = which(igraph::degree( tree, mode='out' ) == 0)
  291. if( type=='name' ) return(igraph::V(tree)[leaf_indices]$name)
  292. if( type=='igraph_node' ) return(igraph::V(tree)[leaf_indices])
  293. if( type=='igraph' ) return(igraph::V(tree)[leaf_indices])
  294. if( type=='index' ) return(leaf_indices)
  295. }
  296. get_segments <- function(hi_tree, binsize_thresh, return_segmentss_tree=FALSE)
  297. {
  298. leaf_widths = get_leaves(hi_tree, type='igraph')$width_rescaled
  299. if( binsize_thresh <= min(leaf_widths) )
  300. {
  301. warning( 'Your input binsize_thresh has a value: ', binsize_thresh, ', which is smaller than the minimum of the leaf width of hi_tree: ', min(leaf_widths) )
  302. }
  303. nodes2rm = numeric()
  304. for( i in 2:igraph::vcount(hi_tree) ) ## root node not take into account
  305. {
  306. node = igraph::V(hi_tree)[i]
  307. width = node$right_rescaled - node$left_rescaled + 1
  308. if(width <= binsize_thresh)
  309. {
  310. parent = igraph::ego(hi_tree, order=1, nodes=node, mode = "in", mindist = 1)[[1]]
  311. width_parent = parent$right_rescaled - parent$left_rescaled + 1
  312. if(width_parent <= binsize_thresh) nodes2rm = c(nodes2rm, i)
  313. }
  314. }
  315. trimmed_tree = hi_tree - nodes2rm
  316. leaves = get_leaves(trimmed_tree, type='igraph_node')
  317. segmentss = cbind(leaves$left_rescaled, leaves$right_rescaled)
  318. if(return_segmentss_tree==TRUE) return( trimmed_tree )
  319. return( segmentss )
  320. }
  321. get_tree_decoration = function( single_res_info, decoration=TRUE, distr, n_parameters, imputation_num=1E2 )
  322. {
  323. cA = single_res_info$cA
  324. # cat(dim(cA), '\n')
  325. if( !is.null(single_res_info$full_tree) ) tree=single_res_info$full_tree
  326. if( is.null(single_res_info$full_tree) ) tree = get_tree_v0(single_res_info)
  327. if(length(tree)==1) if( class(tree)=="character" & tree=='bad_tree') return( tree )
  328. ## more decoration
  329. if( decoration==FALSE ) return( tree )
  330. leaf_indices = get_leaves( tree, type='index' )
  331. igraph::V(tree)$mean_diff = 0
  332. zero_ratios = numeric()
  333. for(i in setdiff(1:igraph::vcount(tree), leaf_indices))
  334. {
  335. node = igraph::V(tree)[i]
  336. # igraph::V(tree)[i]$L = L[node$left, node$right]
  337. # A_union = cA[node$left:node$right, node$left:node$right]
  338. # igraph::V(tree)[i]$L_union = get_prob_nb( A_union[upper.tri(A_union, diag=TRUE)] )
  339. # the p-value of: H0: unioned model; H1: hierarchical model
  340. twins = igraph::ego(tree, 1, node, mode='out', mindist=1)[[1]]
  341. mid = sort(c(as.numeric(twins[1]$left), as.numeric(twins[1]$right), as.numeric(twins[2]$left), as.numeric(twins[2]$right)))[2]
  342. # if(distr=='nb') test_info = p_likelihood_ratio_nb( cA, head=node$left, mid=mid, tail=node$right )
  343. # if(distr=='norm') test_info = p_likelihood_ratio_norm( cA, head=node$left, mid=mid, tail=node$right )
  344. # if(distr=='gamma')
  345. # {
  346. # test_info = p_likelihood_ratio_gamma( cA, head=node$left, mid=mid, tail=node$right, n_parameters, imputation=FALSE )
  347. # test_info_imputation = p_likelihood_ratio_gamma( cA, head=node$left, mid=mid, tail=node$right, n_parameters, imputation=TRUE )
  348. # igraph::V(tree)[i]$imp_p = test_info_imputation$p
  349. # igraph::V(tree)[i]$imp_Lambda = test_info_imputation$Lambda
  350. # }
  351. if(distr=='lnorm')
  352. {
  353. if(!is.null(imputation_num))
  354. {
  355. test_info_imp = p_likelihood_ratio_lnorm( cA, head=as.numeric(node$left), mid=mid, tail=as.numeric(node$right), n_parameters=n_parameters, imputation= TRUE, imputation_num=imputation_num )
  356. igraph::V(tree)[i]$imp_p = test_info_imp$p
  357. igraph::V(tree)[i]$mean_diff = test_info_imp$mean_diff
  358. }
  359. test_info_nimp = p_likelihood_ratio_lnorm( cA, head=as.numeric(node$left), mid=mid, tail=as.numeric(node$right), n_parameters=n_parameters, imputation=FALSE, imputation_num=imputation_num )
  360. igraph::V(tree)[i]$nimp_p = test_info_nimp$p
  361. igraph::V(tree)[i]$mean_diff = test_info_imp$mean_diff
  362. # zero_ratios = c(zero_ratios, sum(cA[node$left:node$right, node$left:node$right]==0) / (length(node$left:node$right))^2)
  363. }
  364. if(distr=='wilcox')
  365. {
  366. if(i==1) test_info = p_wilcox_test( is_CD=TRUE, cA, head=as.numeric(node$left), mid=mid, tail=as.numeric(node$right), alternative='less' )
  367. if(i!=1) test_info = p_wilcox_test( is_CD=FALSE, cA, head=as.numeric(node$left), mid=mid, tail=as.numeric(node$right), alternative='less' )
  368. # if(i==1) test_info = p_wilcox_test_nested( is_CD=TRUE, cA, head=as.numeric(node$left), mid=mid, tail=as.numeric(node$right), alternative='less' )
  369. # if(i!=1) test_info = p_wilcox_test_nested( is_CD=FALSE, cA, head=as.numeric(node$left), mid=mid, tail=as.numeric(node$right), alternative='less' )
  370. igraph::V(tree)[i]$nimp_p = igraph::V(tree)[i]$imp_p = igraph::V(tree)[i]$wilcox_p = test_info$p
  371. igraph::V(tree)[i]$mean_diff = test_info$mean_diff
  372. # zero_ratios = c(zero_ratios, sum(cA[node$left:node$right, node$left:node$right]==0) / (length(node$left:node$right))^2)
  373. }
  374. test_info_mean_diff = lognormal_mean_test( cA, head=as.numeric(node$left), mid=mid, tail=as.numeric(node$right) )
  375. igraph::V(tree)[i]$aa_p = test_info_mean_diff$p_Aa
  376. igraph::V(tree)[i]$bb_p = test_info_mean_diff$p_Ab
  377. }
  378. igraph::V(tree)$L_diff = 0
  379. # igraph::V(tree)$L_diff = igraph::V(tree)$L - igraph::V(tree)$L_union
  380. if(!is.null(imputation_num)) igraph::V(tree)[leaf_indices]$imp_p = 1
  381. igraph::V(tree)[leaf_indices]$nimp_p = igraph::V(tree)[leaf_indices]$wilcox_p = igraph::V(tree)[leaf_indices]$imp_p = 1
  382. igraph::V(tree)[leaf_indices]$aa_p = 1
  383. igraph::V(tree)[leaf_indices]$bb_p = 1
  384. return(tree)
  385. }
  386. add_boundary_binsignal_to_decrated_branches <- function( decorated_branches, bin_signals_5_10 )
  387. {
  388. tree_widths = sapply(decorated_branches, function(v) igraph::V(v)[1]$width)
  389. if(sum(tree_widths) != length( bin_signals_5_10 )) stop('Check add_boundary_binsignal_to_decrated_branches')
  390. for(k in 1:length(decorated_branches))
  391. {
  392. tree = decorated_branches[[k]]
  393. leaf_indices = get_leaves( tree, type='index' )
  394. for(i in setdiff(1:igraph::vcount(tree), leaf_indices))
  395. {
  396. node = igraph::V(tree)[i]
  397. # igraph::V(tree)[i]$L = L[node$left, node$right]
  398. # A_union = cA[node$left:node$right, node$left:node$right]
  399. # igraph::V(tree)[i]$L_union = get_prob_nb( A_union[upper.tri(A_union, diag=TRUE)] )
  400. # the p-value of: H0: unioned model; H1: hierarchical model
  401. twins = igraph::ego(tree, 1, node, mode='out', mindist=1)[[1]]
  402. mid = sort(c(as.numeric(twins[1]$left), as.numeric(twins[1]$right), as.numeric(twins[2]$left), as.numeric(twins[2]$right)))[2]
  403. absolute_mid = mid + sum(tree_widths[(1:k)-1])
  404. igraph::V(tree)[i]$binsignal = bin_signals_5_10[absolute_mid]
  405. }
  406. igraph::V(tree)[leaf_indices]$binsignal = 0
  407. igraph::V(tree)[which(is.na(igraph::V(tree)$binsignal))]$binsignal = 0
  408. decorated_branches[[k]] = tree
  409. }
  410. return(decorated_branches)
  411. }
  412. get_tree_v0 <- function( single_res_info )
  413. {
  414. ancestors = single_res_info$ancestors
  415. L = single_res_info$L
  416. N = dim(ancestors)[1]
  417. current_node = ancestors[1, N]
  418. recursive <- function(current_node)
  419. {
  420. # cat(current_node, '\n')
  421. seqs = as.numeric(strsplit(current_node,'-')[[1]])
  422. # cat( current_node, '\n' )
  423. left_node = ancestors[seqs[1],seqs[2]]
  424. right_node = ancestors[seqs[3],seqs[4]]
  425. flag_left = (seqs[1]!=seqs[2]) & (left_node!="")
  426. flag_right = (seqs[3]!=seqs[4]) & (right_node!="")
  427. if(is.na(flag_left)) return( 'bad_tree' ) ## added: 30-04-2020
  428. left_leaf = paste( seqs[1],seqs[2], sep='-' )
  429. right_leaf = paste( seqs[3],seqs[4], sep='-' )
  430. if( flag_left ) left_tree = recursive(left_node)
  431. if( !flag_left ) left_tree = left_leaf
  432. if( flag_right ) right_tree = recursive(right_node)
  433. if( !flag_right ) right_tree = right_leaf
  434. if( !flag_left ) left_node = left_leaf
  435. if( !flag_right ) right_node = right_leaf
  436. tree_raw = igraph::graph.empty() + current_node + left_tree + right_tree
  437. tree = igraph::add_edges(tree_raw, c(current_node, left_node, current_node, right_node))
  438. return( tree )
  439. }
  440. tree = recursive(current_node)
  441. # stop('I stop here')
  442. # if(tree=='bad_tree') return('bad_tree') ## added: 30-04-2020
  443. if(class(tree)!='igraph') return('bad_tree')
  444. ## this part tries to decorate the tree by adding various node attributes
  445. igraph::V(tree)$left = sapply( igraph::V(tree)$name, function(v) {tmp=strsplit(v,'-')[[1]]; return(as.numeric(head(tmp,1)))} )
  446. igraph::V(tree)$right = sapply( igraph::V(tree)$name, function(v) {tmp=strsplit(v,'-')[[1]]; return(as.numeric(tail(tmp,1)))} )
  447. igraph::V(tree)$name = sapply( igraph::V(tree)$name, function(v) {tmp=strsplit(v,'-')[[1]]; paste( '(', head(tmp,1), ',', tail(tmp,1), ')', sep='' )} )
  448. igraph::V(tree)$width = igraph::V(tree)$right - igraph::V(tree)$left + 1
  449. return(tree)
  450. }
  451. is_binary_tree <-function(tree)
  452. {
  453. leaves = get_leaves( tree, 'index' )
  454. not_leaves = setdiff(1:igraph::vcount(tree), leaves)
  455. degrees = igraph::degree(tree, v = not_leaves, mode = 'out')
  456. if(!igraph::is.connected( tree )) return(FALSE)
  457. if(all(degrees==2)) return(TRUE)
  458. return(FALSE)
  459. }
  460. join_left_or_right <- function(tree)
  461. {
  462. leaves = get_leaves( tree, 'igraph' )
  463. left_or_right = numeric()
  464. left_or_right[1] = 0
  465. left_or_right[length(leaves) - 1] = 0
  466. for( i in 2:(length(leaves) - 1) )
  467. {
  468. leave = leaves[i]
  469. parent = igraph::ego(tree, nodes=leave, order=1, mindist=1, mode='in')[[1]]
  470. if( parent$left < leave$left ) left_or_right[i] = -1 ## left
  471. if( parent$right > leave$right ) left_or_right[i] = 1 ## right
  472. }
  473. return( left_or_right )
  474. }
  475. long_slices <- function(left_or_right, thresh=3)
  476. {
  477. dup_lens = rle( left_or_right )$lengths
  478. dup_values = rle( left_or_right )$values
  479. long_dups = which( dup_lens >= thresh )
  480. long_dup_lens = dup_lens[long_dups]
  481. long_dup_start_pos = cumsum( dup_lens )[long_dups-1] + 1 # 1: shift to the right by one
  482. long_dup_direction = left_or_right[ long_dup_start_pos ]
  483. indices_of_slice = as.vector(unlist(mapply(function(u,v) {seq(from=u, length=v, by=1)}, long_dup_start_pos, long_dup_lens)))
  484. dup_info = list( long_slices_lens=long_dup_lens, long_slices_start_pos=long_dup_start_pos, long_dup_direction=long_dup_direction, indices_of_slice=indices_of_slice )
  485. return(dup_info)
  486. }
  487. n_cells2compute <- function( A, dist, min_n_bins=1 )
  488. {
  489. N = nrow(A)
  490. if(min_n_bins==1)
  491. {
  492. count = dist*(1+dist)*(2+dist)*(N-dist)/6
  493. return(count)
  494. }
  495. count = (dist - 2*min_n_bins + 2)*(dist^2 + 2*dist*min_n_bins + dist - 2*(min_n_bins - 2)*min_n_bins)*(N - dist) / 6
  496. return(count)
  497. }
  498. plot_tree <- function(tree, seg_col='blue', which_part='upper', indices_of_slice=NULL, ...)
  499. {
  500. root_node = igraph::V(tree)[1]
  501. leaves = igraph::V(tree)[which(igraph::degree( tree, mode='out' ) == 0)]
  502. plot_inner <- function( node ){
  503. left = node$left
  504. right = node$right
  505. if(right - left == 1)
  506. {
  507. return()
  508. }
  509. x0 = left - 0.5
  510. x1 = right + 0.5
  511. if(which_part=='upper')
  512. {
  513. segments(x0, x0, x0, x1, col=seg_col, ...)
  514. segments(x0, x1, x1, x1, col=seg_col, ...)
  515. }
  516. if(which_part=='lower')
  517. {
  518. segments(x0, x0, x1, x0, col='black', ...)
  519. segments(x1, x1, x1, x0, col='black', ...)
  520. }
  521. if(which_part=='both')
  522. {
  523. segments(x0, x0, x0, x1, col=seg_col, ...)
  524. segments(x0, x1, x1, x1, col=seg_col, ...)
  525. segments(x0, x0, x1, x0, col='black', ...)
  526. segments(x1, x1, x1, x0, col='black', ...)
  527. }
  528. if( !is.null(indices_of_slice) )
  529. {
  530. if(left %in% indices_of_slice)
  531. segments(x0, x0, x0, x1, col='yellow', ...)
  532. if(right %in% indices_of_slice)
  533. segments(x0, x1, x1, x1, col='black', ...)
  534. }
  535. if( node %in% leaves ) return()
  536. twins = igraph::ego(tree, node=node, order=1, mindist=1, mode='out')[[1]]
  537. left_node = twins[1]
  538. plot_inner(left_node)
  539. if( length(twins) > 1 )
  540. {
  541. # cat('hello', '\n')
  542. right_node = twins[2]
  543. plot_inner(right_node)
  544. }
  545. }
  546. plot_inner(root_node)
  547. return()
  548. }
  549. remove_blank_cols <- function( mat, sparse=FALSE, row_or_col='both', ratio=0.05 )
  550. {
  551. if(sparse==TRUE)
  552. {
  553. if(row_or_col=='row')
  554. {
  555. non_zero_indices = (Matrix::rowSums(mat != 0) / ncol(mat) > ratio)
  556. new_mat = mat[non_zero_indices, ]
  557. return(new_mat)
  558. }
  559. if(row_or_col=='col')
  560. {
  561. non_zero_indices = (Matrix::colSums(mat != 0) / nrow(mat) > ratio)
  562. new_mat = mat[, non_zero_indices]
  563. return(new_mat)
  564. }
  565. if( ratio > 0 ) ## remove col/rows with non-zero number smaller than 5% percentile
  566. {
  567. positive_num = unname(Matrix::rowSums(mat != 0))
  568. positive_num_thresh = quantile(positive_num[positive_num > 0], ratio)
  569. non_zero_indices = (Matrix::rowSums(mat != 0) > positive_num_thresh)
  570. new_mat = mat[non_zero_indices, non_zero_indices]
  571. return(new_mat)
  572. }
  573. non_zero_indices = (Matrix::rowSums(mat != 0) / ncol(mat) > ratio)
  574. new_mat = mat[non_zero_indices, non_zero_indices]
  575. return(new_mat)
  576. }
  577. col_sum = apply( mat, 2, sum )
  578. blank_col_indices = which( col_sum==0 )
  579. new_mat = mat[ -blank_col_indices, -blank_col_indices ]
  580. return(new_mat)
  581. }
  582. tree_germination <- function(tree)
  583. {
  584. leaves = get_leaves( tree )
  585. for( leaf in leaves )
  586. {
  587. cat(leaf, '\n')
  588. if((igraph::vcount(tree) - ecount(tree)) !=1) break
  589. bins = as.character(igraph::V(tree)[leaf]$left:igraph::V(tree)[leaf]$right)
  590. if(length(bins) == 1) stop('Some leave node contains only one bin, please check')
  591. if(length(bins) == 2) ## this is an exception. added on 11/06/2018
  592. {
  593. nodes_supp = as.character(bins)
  594. edges2add = c(leaf, bins[1], leaf, bins[2])
  595. tree = tree %>% igraph::add_vertices(length(nodes_supp), name=nodes_supp) %>% igraph::add_edges(edges2add)
  596. next
  597. }
  598. joints_supp = paste('j', bins[1:(length(bins) - 2)], sep='')
  599. joints = c(leaf, joints_supp, bins[length(bins)])
  600. edges2add = character()
  601. for(i in 1:(length(joints)-1))
  602. {
  603. edges2add = c(edges2add, c( joints[i], bins[i], joints[i], joints[i+1] ))
  604. }
  605. nodes_supp = c(joints_supp, bins)
  606. tree = tree %>% igraph::add_vertices(length(nodes_supp), name=nodes_supp) %>% igraph::add_edges(edges2add)
  607. }
  608. return( tree )
  609. }
  610. trim_tree_adaptive <- function( tree, L_diff_thresh=-Inf, max_imp_p=Inf, max_nimp_p=Inf, width_thresh=-Inf, boundary_signal_thresh=Inf, peak_thresh=-Inf )
  611. {
  612. if(igraph::vcount(tree) ==1) return(tree) ## so this can be used!
  613. width_thresh = min(width_thresh, igraph::V(tree)[1]$width-1) ## cannot merge if tree is already very small
  614. nodes2rm_Ldiff = igraph::V(tree)[which( igraph::V(tree)$L_diff <= L_diff_thresh )]$name
  615. nodes2rm_imp_p = igraph::V(tree)[which( igraph::V(tree)$imp_p >= max_imp_p )]$name
  616. nodes2rm_nimp_p = igraph::V(tree)[which( igraph::V(tree)$nimp_p >= max_nimp_p )]$name
  617. nodes2rm_boundary_signal = igraph::V(tree)[which( igraph::V(tree)$binsignal <= boundary_signal_thresh )]$name
  618. nodes2rm_not_peak = igraph::V(tree)[which( igraph::V(tree)$is_peak <= peak_thresh )]$name
  619. nodes2rm = unique( c( nodes2rm_Ldiff, nodes2rm_imp_p, nodes2rm_nimp_p, nodes2rm_boundary_signal, nodes2rm_not_peak ) )
  620. nodes_not_2rm = setdiff( igraph::V(tree)$name, nodes2rm )
  621. flags = sapply( nodes2rm, function(v) { children = names(unlist( igraph::ego( tree, nodes = v, order=igraph::diameter(tree), mindist=1, mode='out' ))); all(children %in% nodes2rm) } )
  622. nodes2rm_final = names(unlist( igraph::ego(tree, nodes = nodes2rm[which(flags==TRUE)], order=igraph::diameter(tree), mindist=1, mode='out' ) ) )
  623. if(!is.null(nodes2rm_final)) tree = tree - nodes2rm_final
  624. if(is.null(nodes2rm_final)) cat('No nodes are removed at the given p_thresh\n')
  625. if( width_thresh==-Inf ) return( tree )
  626. tree = trim_tree_adaptive_width_thresh(tree, width_thresh)
  627. return( tree )
  628. }
  629. trim_tree_adaptive_width_thresh <- function( tree, width_thresh )
  630. {
  631. ## first remove all nodes of a parent if both are < width_thresh
  632. nodes2rm_width = igraph::V(tree)[which( igraph::V(tree)$width <= width_thresh )]$name
  633. parentOfnodes2rm_width = names(unlist(igraph::ego(tree, node=nodes2rm_width, order=1, mindist=1, mode='in')))
  634. ## if two nodes have the same parent, remove them
  635. index2rm = c(which(duplicated(parentOfnodes2rm_width)), which(duplicated(fromLast=TRUE, parentOfnodes2rm_width)))
  636. nodes2rm = nodes2rm_width[index2rm]
  637. tree = tree - nodes2rm
  638. if(!is_binary_tree(tree)) stop('Error in trim_tree_adaptive_width_thresh')
  639. ## merge TADs that are too small
  640. nodes2rm_width = igraph::V(tree)[which( igraph::V(tree)$width <= width_thresh )]$name
  641. di = igraph::diameter(tree)
  642. while( length(nodes2rm_width) > 0 )
  643. {
  644. node = nodes2rm_width[1]
  645. parent = igraph::ego( tree, nodes = node, order=1, mindist=1, mode='in' )[[1]]
  646. nodesOfSameParent = igraph::ego( tree, nodes = parent, order=di, mindist=1, mode='out' )[[1]]
  647. left_siblings = nodesOfSameParent[which(nodesOfSameParent$right < igraph::V(tree)[node]$left)]
  648. right_siblings = nodesOfSameParent[which(nodesOfSameParent$left > igraph::V(tree)[node]$right)]
  649. left_siblings2change_right = left_siblings[left_siblings$right == (igraph::V(tree)[node]$left-1)]
  650. right_siblings2change_left = right_siblings[right_siblings$left == (igraph::V(tree)[node]$right+1)]
  651. igraph::V(tree)[ left_siblings2change_right ]$right = igraph::V(tree)[node]$right
  652. igraph::V(tree)[ right_siblings2change_left ]$left = igraph::V(tree)[node]$left
  653. igraph::V(tree)$width = igraph::V(tree)$right - igraph::V(tree)$left + 1
  654. tree = tree - node
  655. nodes2rm_width = igraph::V(tree)[which( igraph::V(tree)$width <= width_thresh )]$name
  656. }
  657. igraph::V(tree)$name = paste('(',igraph::V(tree)$left, ',', igraph::V(tree)$right, ')', sep='')
  658. return(tree)
  659. }
  660. ## should be different from trim_tree_adaptive_width_thresh because binsignal is not monotonic
  661. trim_tree_adaptive_binsig_thresh <- function( tree, boundary_signal_thresh )
  662. {
  663. ## merge TADs that are too small
  664. parent_of_nodes2rm = igraph::V(tree)[which( igraph::V(tree)$binsignal <= boundary_signal_thresh )]$name
  665. leaves = get_leaves( tree )
  666. nodes2rm = intersect(leaves, names(unlist(igraph::ego( tree, nodes = parent_of_nodes2rm, order=1, mindist=1, mode='out' ))))
  667. while( length(nodes2rm) > 0 )
  668. {
  669. di = igraph::diameter(tree)
  670. node = nodes2rm[1]
  671. parent = igraph::ego( tree, nodes = node, order=1, mindist=1, mode='in' )[[1]]
  672. sibling = setdiff(igraph::ego( tree, nodes = parent, order=1, mindist=1, mode='out' )[[1]]$name, node)
  673. twins_of_sibling = igraph::ego( tree, nodes = sibling, order=1, mindist=1, mode='out' )[[1]]$name
  674. nodesOfSameParent = igraph::ego( tree, nodes = parent, order=di, mindist=1, mode='out' )[[1]]
  675. left_siblings = nodesOfSameParent[which(nodesOfSameParent$right < igraph::V(tree)[node]$left)]
  676. right_siblings = nodesOfSameParent[which(nodesOfSameParent$left > igraph::V(tree)[node]$right)]
  677. left_siblings2change_right = left_siblings[left_siblings$right == (igraph::V(tree)[node]$left-1)]
  678. right_siblings2change_left = right_siblings[right_siblings$left == (igraph::V(tree)[node]$right+1)]
  679. igraph::V(tree)[ left_siblings2change_right ]$right = igraph::V(tree)[node]$right
  680. igraph::V(tree)[ right_siblings2change_left ]$left = igraph::V(tree)[node]$left
  681. if(length(twins_of_sibling) > 0) tree = igraph::add_edges(tree, c( parent$name, twins_of_sibling[1], parent$name, twins_of_sibling[2]) )
  682. tree = tree - node - sibling
  683. igraph::V(tree)$width = igraph::V(tree)$right - igraph::V(tree)$left + 1
  684. parent_of_nodes2rm = igraph::V(tree)[which( igraph::V(tree)$binsignal <= boundary_signal_thresh )]$name
  685. leaves = get_leaves( tree )
  686. nodes2rm = intersect(leaves, names(unlist(igraph::ego( tree, nodes = parent_of_nodes2rm, order=1, mindist=1, mode='out' ))))
  687. }
  688. igraph::V(tree)$name = paste('(',igraph::V(tree)$left, ',', igraph::V(tree)$right, ')', sep='')
  689. return(tree)
  690. }
  691. visualize_left_or_right <- function(left_or_right, ...)
  692. {
  693. len = length( left_or_right )
  694. plot(1, type="n", xlab="", ylab="", xlim=c(0, len), ylim=c(-2, 2))
  695. for( i in 1:len )
  696. {
  697. if( left_or_right[i]==1 ) segments(i-0.1, 1, i+1, 1, col='red', ...)
  698. if( left_or_right[i]==-1 ) segments(i-0.1, -1, i+1, -1, col='blue', ...)
  699. }
  700. }
  701. xenocraft <- function( trunk, branches )
  702. {
  703. xenocraft_nodes = sapply(branches, function(v) igraph::V(v)[1]$name)
  704. if(!all(xenocraft_nodes %in% igraph::V(trunk)$name)) stop("Check xenocraft function")
  705. children_xenocraft_nodes = unique(unlist(sapply(igraph::ego(trunk, order=igraph::diameter(trunk), node=xenocraft_nodes, mode='out', mindist=1), function(v) v$name)))
  706. prunned_trunck = trunk - children_xenocraft_nodes
  707. full_tree = Reduce( union, c(list(prunned_trunck), branches) )
  708. att2delete = setdiff(vertex_attr_names(full_tree), 'name')
  709. for( att in att2delete ) full_tree = delete_vertex_attr(full_tree, att)
  710. return( full_tree )
  711. }
  712. fast_cor <- function(mat)
  713. {
  714. res = 1/( NROW( mat ) -1)*crossprod ( scale( mat , TRUE , TRUE ) )
  715. # scaled_mat = scale( mat , TRUE , TRUE )
  716. # res = 1/( NROW( mat ) -1)*matrix_multiplication_sym_cpp( scaled_mat )
  717. return(res)
  718. }
  719. fast_cor_cor <- function(mat, k)
  720. {
  721. scaled_mat = scale(mat , TRUE , TRUE )
  722. coeff = 1/( NROW( mat ) -1) ## there is no need to multiply this coeff when computing Pearson coeff
  723. res_begin = coeff*crossprod( scaled_mat[, 1:k], scaled_mat )
  724. for(j in (k+1):nrow(mat))
  725. {
  726. res_slice = res_begin[-1,]
  727. res_next = coeff*crossprod( scaled_mat[, j], scaled_mat )
  728. res_slice = rbind(res_slice, res_next)
  729. new_values = cor(t(res_slice), t(res_next))
  730. cat(j, '\n')
  731. }
  732. }
  733. get_bin_singals_CHiP <- function(chr, hc_ordered, res, ks, ChiP_NAME, CELL_LINE, ChiP_data_already_loaded=FALSE)
  734. {
  735. #################
  736. n_bins_of_CD = sapply(res$initial_clusters, length)
  737. pos_start_end = lapply(res$initial_clusters, function(v)
  738. {
  739. bins_ori = as.numeric(res$bin_names[v])
  740. from_id = min(bins_ori)
  741. to_id = max(bins_ori)
  742. start_pos = (from_id-1)*bin_size + 1
  743. end_pos = to_id*bin_size
  744. return(c(start_pos, end_pos))
  745. })
  746. pos_start_end = do.call(rbind, pos_start_end)
  747. #################
  748. ordered_bin_signals_ALL_k_list = lapply(ks, function(k){
  749. hc_k_labels = get_cluser_levels(hc_ordered, k_clusters=k, balanced_4_clusters=FALSE)$cluster_labels
  750. if(is.null(ChiP_NAME))
  751. {
  752. CD_index = labels(hc_ordered)
  753. pos_start_end = pos_start_end[match( CD_index, rownames(pos_start_end) ), ]
  754. ordered_bin_signals_ALL = data.frame(CD_index=CD_index, n_bins=n_bins_of_CD[CD_index], pos_start=pos_start_end[,1], pos_end=pos_start_end[,2], compartment=hc_k_labels[CD_index])
  755. return(ordered_bin_signals_ALL)
  756. }
  757. cluster_signal = get_names_by_H3k4me1(chr, res, ChiP_NAME, kmeans_cluster_assigment=hc_k_labels, CELL_LINE=CELL_LINE, ChiP_data_already_loaded=ChiP_data_already_loaded)
  758. # cluster_signal = rbind(cluster_signal, sorted_coverage)
  759. ordered_bin_signals_ALL = apply(cluster_signal, 1, function(v)
  760. {
  761. bin_signals = v[hc_k_labels]
  762. names(bin_signals) = names(hc_k_labels)
  763. ordered_bin_signals = bin_signals[labels(hc_ordered)] ## order by dendro topology
  764. return(ordered_bin_signals)
  765. })
  766. ordered_bin_signals_ALL = as.data.frame(ordered_bin_signals_ALL)
  767. # CD_index = rownames(ordered_bin_signals_ALL)
  768. CD_index = labels(hc_ordered)
  769. pos_start_end = pos_start_end[match( CD_index, rownames(pos_start_end) ), ]
  770. ordered_bin_signals_ALL = data.frame(CD_index=CD_index, n_bins=n_bins_of_CD[CD_index], pos_start=pos_start_end[,1], pos_end=pos_start_end[,2], compartment=hc_k_labels[CD_index], ordered_bin_signals_ALL)
  771. return(ordered_bin_signals_ALL)
  772. })
  773. names(ordered_bin_signals_ALL_k_list) = paste0('clusters_', ks)
  774. names(ordered_bin_signals_ALL_k_list)[which(names(ordered_bin_signals_ALL_k_list)=="clusters_Inf")] = 'compartment_domains'
  775. return(ordered_bin_signals_ALL_k_list)
  776. }
  777. densplot = function(x,y,points = FALSE, pch=19, cex=1, xlim=c(min(x),max(x)), ylim=c(min(y),max(y)), ...){
  778. df = data.frame(x,y)
  779. d = densCols(x,y, colramp=colorRampPalette(c("black", "white")))
  780. df$dens = col2rgb(d)[1,] + 1L
  781. cols = colorRampPalette(c("#000099", "#00FEFF", "#45FE4F","#FCFF00", "#FF9400", "#FF3100"))(256)
  782. df$col = cols[df$dens]
  783. df=df[order(df$dens),]
  784. if(points)
  785. points(df$x,df$y, pch=pch, col=df$col, cex=cex, ...)
  786. else
  787. plot(df$x,df$y, pch=pch, col=df$col, cex=cex, xlim=xlim, ylim=ylim, ...)
  788. }
  789. my_merge = function(...) merge(..., all=TRUE)
  790. build_chr_bin_domain_fun <- function( CELL_LINE, chrs, cluster_level, p_thresh, ob_oe, downsratio=NULL, compress_size=10 )
  791. {
  792. chrs = as.character(chrs)
  793. chr_bin_domain_tmp = lapply(chrs, function(chr) get_clusters_bins_xy(CELL_LINE, chr, cluster_level, p_thresh, ob_oe, downsratio=downsratio, compress_size=compress_size))
  794. names( chr_bin_domain_tmp ) = chrs ## can use mapply
  795. chr_bin_domain = lapply(chrs, function(v)
  796. {
  797. chr_bin_domain_ind = data.frame( chr=paste0('chr', v), bin_index=as.numeric(unlist(chr_bin_domain_tmp[[v]])), intra_domain=rep(names(chr_bin_domain_tmp[[v]]), sapply(chr_bin_domain_tmp[[v]], length)) )
  798. chr_bin_domain_ind = chr_bin_domain_ind[order(chr_bin_domain_ind$bin_index), ]
  799. return(chr_bin_domain_ind)
  800. })
  801. names(chr_bin_domain) = chrs
  802. res = do.call(rbind, chr_bin_domain)
  803. rownames(res) = NULL
  804. return( res )
  805. }
  806. build_chr_bin_domain_fun_direct <- function( chr, initial_clusters, cluster_vec, bin_names ) ## directly from R workingspace instead of loading
  807. {
  808. chrs = as.character(chr) ## for historic reason, chr -- chrs
  809. chr_bin_domain_tmp = lapply(chrs, function(chr) get_cluster_bin_names(initial_clusters, cluster_vec, bin_names))
  810. names( chr_bin_domain_tmp ) = chrs ## can use mapply
  811. chr_bin_domain = lapply(chrs, function(v) data.frame( chr=paste0('chr', v), bin_index=as.numeric(unlist(chr_bin_domain_tmp[[v]])), intra_domain=rep(names(chr_bin_domain_tmp[[v]]), sapply(chr_bin_domain_tmp[[v]], length)) ))
  812. names(chr_bin_domain) = chrs
  813. return( do.call(rbind, chr_bin_domain) )
  814. }
  815. get_clusters_bins_xy <- function(CELL_LINE, chr, cluster_level, p_thresh, ob_oe='oe', downsratio, compress_size, sort=TRUE)
  816. {
  817. if(ob_oe=='oe') sub_folder = paste0('./', CELL_LINE, '/oe_chr_', chr, '_', bin_size/1E3, 'kb_', compress_size, 'to1_', p_thresh)
  818. if(ob_oe=='ob') sub_folder = paste0('./', CELL_LINE, '/ob_chr_', chr, '_', bin_size/1E3, 'kb_', compress_size, 'to1_', p_thresh)
  819. if(!is.null(downsratio)) compartments_Rdata_file = paste0(sub_folder, '/chr', chr, '_compartments_atanh_log_AB', ws, 'downsratio_', downsratio, '.Rdata')
  820. if(is.null(downsratio)) compartments_Rdata_file = paste0(sub_folder, '/chr', chr, '_compartments_atanh_log_AB3_3.Rdata')
  821. load(compartments_Rdata_file)
  822. clusters_bins = get_cluster_bin_names(sort=sort, res$initial_clusters, res$clusters[, cluster_level], res$bin_names)
  823. rm( res )
  824. return(clusters_bins)
  825. }
  826. get_cluster_bin_names <- function(initial_clusters, cluster_vec, bin_names, sort=TRUE)
  827. {
  828. if(sort==TRUE) cluster_indices = sort(unique(cluster_vec), decreasing=TRUE)
  829. if(sort==FALSE) cluster_indices = unique(cluster_vec)
  830. cluster_bins = lapply(cluster_indices, function(v)
  831. {
  832. indices = which(cluster_vec==v)
  833. bin_names[unlist(initial_clusters[indices])]
  834. })
  835. names(cluster_bins) = cluster_indices
  836. return(cluster_bins)
  837. }
  838. # ave_cor <- function(mat, seg_len) ## seg_len is the length of a segment
  839. # {
  840. # d = 1:nrow(mat)
  841. # seq_index = split(d, ceiling(seq_along(d)/10))
  842. # tmp = simplify2array( lapply(seq_index[1:22], function(v) fast_cor( mat[v, ] )/10*length(v)) )
  843. # ave_cor_val = rowMeans(tmp, dims = 2)
  844. # }
  845. # if( kmeans_cluster_assigment ): compute the cluster_level signals
  846. reorder_dendro <- function(hc_object, named_weights, return_g=FALSE, aggregateFun=mean)
  847. {
  848. get_children <- function(g, root_node)
  849. {
  850. leaves = igraph::V(g)[igraph::degree(g)==1]$name
  851. children = intersect(leaves, igraph::ego(g, mode='out', root_node, order=igraph::diameter(g))[[1]]$name)
  852. return(children)
  853. }
  854. # hc_dendro = as.dendrogram(hc_object)
  855. # cat('hello I am here\n')
  856. # print(class(hc_object))
  857. g = igraph::as.igraph(ape::as.phylo(hc_object))
  858. # stop('I am here')
  859. leaves = igraph::V(g)[igraph::degree(g)==1]$name
  860. igraph::V(g)$weight = sapply(1:igraph::vcount(g), function(v) aggregateFun(named_weights[get_children(g, v)]))
  861. if(return_g==TRUE) return(g)
  862. swap_branches <- function(g, root_node)
  863. {
  864. twins = igraph::ego(g, mode='out', root_node, order=1, mindist=1)[[1]]$name
  865. twins_weight = igraph::V(g)[twins]$weight
  866. if(twins_weight[1] > twins_weight[2])
  867. {
  868. children_of_twin_A = get_children(g, twins[1])
  869. children_of_twin_B = get_children(g, twins[2])
  870. leaves = swap_names( leaves, children_of_twin_A, children_of_twin_B )
  871. }
  872. return(leaves)
  873. }
  874. swap_names <- function( leaves, names2swap_A, names2swap_B )
  875. {
  876. names2swap_indices = match(c(names2swap_A, names2swap_B), leaves)
  877. leaves[ names2swap_indices ] = c(names2swap_B, names2swap_A)
  878. return(leaves)
  879. }
  880. for( root_node in setdiff(igraph::V(g)$name, leaves) )
  881. {
  882. leaves = swap_branches(g, root_node)
  883. # cat(leaves, '\n')
  884. }
  885. return(leaves)
  886. }
  887. get_cluser_assignment = function(hc, k_clusters, leaves_hclust_pc)
  888. {
  889. clusters_raw = cutree(hc, k_clusters)
  890. clusters = tapply(as.numeric(names(clusters_raw)), clusters_raw, function(v) list(v))
  891. od = rank(sapply( clusters, function(v) sort(match(v, as.numeric(leaves_hclust_pc)))[1]))
  892. cluster_assignment = numeric(sum(sapply(clusters, length)))
  893. for(j in 1:length(clusters)) cluster_assignment[clusters[[j]]] = od[j]
  894. return( cluster_assignment )
  895. }
  896. get_cluster_boudaries <- function(hc, k_clusters, named_weights)
  897. {
  898. len = length(labels(hc))
  899. clusters_raw = cutree(hc, k_clusters)
  900. clusters = tapply(as.numeric(names(clusters_raw)), clusters_raw, function(v) list(v))
  901. clusters_pc_rank = lapply(clusters, function(v) unname(sort(rank(named_weights)[v])))
  902. bondaries = sapply(clusters_pc_rank, function(v) tail(v,1))
  903. return(setdiff(bondaries, c(1, len)))
  904. }
  905. # get_cluser_vector <- function(hc, k_clusters, named_weights) ## get cluster assignment of 1,2,3,4,5... (A1, A2, ...)
  906. # {
  907. # clusters_raw = cutree(hc, k_clusters)
  908. # clusters = tapply(as.numeric(names(clusters_raw)), clusters_raw, function(v) list(v))
  909. # clusters_pc_rank = unname(rank(sapply(clusters, function(v) unname(sort(rank(named_weights)[v]))[1])))
  910. # cluster_vector = numeric()
  911. # for( i in 1:length(clusters) ) cluster_vector[ clusters[[i]] ] = clusters_pc_rank[i]
  912. # return( cluster_vector )
  913. # }
  914. ## if return binary tree, A1, A2, B1, B2 are forced to be returned
  915. get_cluser_levels <- function(hc_ordered, k_clusters, balanced_4_clusters=FALSE) ## get the detailed A1, A2, ..., B1, B2...
  916. {
  917. assign_twins_name <- function(graph, node)
  918. {
  919. twins = igraph::ego(graph, node, mode='out', order=1, mindist=1)[[1]]$name
  920. if(length(twins)==2) igraph::V(graph)[twins]$level_name = paste0(igraph::V(graph)[node]$level_name, '.', c(2,1))
  921. if(node==igraph::V(graph)[1]$name) igraph::V(graph)[twins]$level_name = c('B', 'A')
  922. return(graph)
  923. }
  924. if(k_clusters==Inf) k_clusters = length(labels(as.dendrogram(hc_ordered)))
  925. #################################################
  926. # cat('haha I am here\n')
  927. # print(class(hc_ordered))
  928. graph = igraph::as.igraph(ape::as.phylo(hc_ordered))
  929. leave_names = get_leaves(graph)
  930. bfs_names = igraph::bfs(graph, 1)$order$name
  931. dfs_names = igraph::dfs(graph, 1)$order$name
  932. igraph::V(graph)[1]$level_name = ''
  933. for( node in bfs_names )
  934. {
  935. graph = assign_twins_name(graph, node)
  936. # if( !any(is.na(igraph::V(graph)[common_father_name]$level_name)) ) break ## when all common_father_name have level_name
  937. if( !any(is.na(igraph::V(graph)$level_name)) ) break ## when all common_father_name have level_name
  938. }
  939. if( balanced_4_clusters==TRUE ) ##A1, A2, B1, B2
  940. {
  941. branch_root_name = c('A.1', 'A.2', 'B.1', 'B.2')
  942. branch_root = match(branch_root_name, igraph::V(graph)$level_name)
  943. children = igraph::ego(graph, order = igraph::diameter(graph), nodes = branch_root, mode = 'out', mindist = 0) ## mindist==0, when itself is a branch
  944. tmp = lapply( children, function(v) intersect(leave_names, v$name) )
  945. cluster_labels = rep(branch_root_name, sapply(tmp, length))
  946. names(cluster_labels) = unlist( tmp )
  947. cluster_labels = cluster_labels[ as.character(1:length(cluster_labels)) ]
  948. return(cluster_labels)
  949. }
  950. #################################################
  951. clusters_raw = cutree(hc_ordered, k_clusters)[labels(hc_ordered)] ## labels are ordered according to pc
  952. clusters = tapply(as.numeric(names(clusters_raw)), clusters_raw, function(v) list(v))[unique(clusters_raw)]
  953. names(clusters) = 1:k_clusters # named by pc order
  954. ## get the vector. Named from 1 to 5 from left leaves to right leaves
  955. cluster_vector = rep( 1:k_clusters, sapply(clusters, length) )
  956. names(cluster_vector) = labels(hc_ordered)
  957. cluster_vector = cluster_vector[ as.character( sort(as.numeric(names(cluster_vector))) ) ]
  958. #################################################
  959. ## which node is the common father of all nodes in a cluster
  960. common_father_index = sapply( clusters, function(u)
  961. {
  962. if(length(u)==1) return( which(u==igraph::V(graph)$name) )
  963. return(max(which(sapply(igraph::ego(graph, order = igraph::diameter(graph), nodes = igraph::V(graph), mode = 'out', mindist = 1), function(v) all(u %in% v$name) ))))
  964. })
  965. common_father_name = igraph::V(graph)[common_father_index]$name
  966. ## whether common_father_name are ordered
  967. if(!all(diff(match( common_father_name, dfs_names )) >= 0)) stop('!!!!!!!!check get_all_children in header_funs.R')
  968. level_names = igraph::V(graph)[common_father_name]$level_name ## label names are ordered
  969. if(any(sort(level_names, decreasing=TRUE)!=level_names)) warning('!!!!!!!!Need to check get_all_children in header_funs.R')
  970. cluster_labels = rep( level_names, sapply(clusters, length) )
  971. names(cluster_labels) = labels(hc_ordered)
  972. cluster_labels = cluster_labels[ as.character( sort(as.numeric(names(cluster_labels))) ) ]
  973. return( list(cluster_vector=cluster_vector, cluster_labels=cluster_labels))
  974. }
  975. get_hkmeans_cluser_levels <- function(hk_cluster_centers, PC1)
  976. {
  977. hc = hclust(as.dist(my_dist(hk_cluster_centers)), method='com')
  978. reordered_names = as.character(rank(PC1))
  979. hc_ordered = dendextend::rotate(hc, reordered_names)
  980. hk_clust_labels = get_cluser_levels(hc_ordered, k_clusters=length(PC1))$cluster_labels
  981. return( hk_clust_labels )
  982. }
  983. ## names_A_final is the rownames of the A_final
  984. ## names_A_final is matched to the names of the starting contact matrix
  985. get_original_tad_indices <- function(names_A_final, TADs, bin_size)
  986. {
  987. start_pos = as.numeric(names_A_final[TADs$start_pos])
  988. end_pos = as.numeric(names_A_final[TADs$end_pos])
  989. start_pos_ori = (start_pos - 1)*bin_size + 1
  990. end_pos_ori = end_pos*bin_size
  991. TADs = data.frame( start_pos_ori=start_pos_ori, end_pos_ori=end_pos_ori )
  992. return( TADs )
  993. }
  994. ## This code updates the branch name
  995. ## Branches obtained from branches = lapply( res_inner, get_tree_v0 )
  996. ## The original branch name start from 1
  997. update_branch_name <- function(branch, root_start)
  998. {
  999. igraph::V(branch)$left = igraph::V(branch)$left + root_start - 1
  1000. igraph::V(branch)$right = igraph::V(branch)$right + root_start - 1
  1001. igraph::V(branch)$name = paste('(',igraph::V(branch)$left, ',', igraph::V(branch)$right, ')', sep='')
  1002. return(branch)
  1003. }