HRG_MLE_each_compartment_fun.R 7.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165
  1. ## Yuanlong LIU
  2. ## 12/04/2018
  3. ## 23/05/2018
  4. ## 29/05/2018
  5. ## A should be already named
  6. ## A should be symmetric
  7. ## Does not remove 0-rows/columns of A
  8. HRG_MLE_each_compartment_fun <- function(pA_sym, seg, allowed_max_nbins_seq=500:1000, max_nbins_fine=max(allowed_max_nbins_seq), ncores, res_dir, fast, adjust_segmentss_topdom=TRUE)
  9. {
  10. min_n_bins_outer=2
  11. min_n_bins_inner=2
  12. distr = 'lnorm'
  13. A = pA_sym[seg, seg]
  14. if( is.null(rownames(A)) | is.null(colnames(A))) stop('A should be named by the bin indices')
  15. arg_list = as.list(environment())
  16. res_folder = file.path(res_dir)
  17. dir.create(res_folder, recursive=TRUE, showWarnings = TRUE)
  18. total_execution_time_file = file.path(res_dir, 'total_execution.time')
  19. cat('n_core used:', ncores, '\n\n', file=total_execution_time_file, append=FALSE)
  20. time_begin = Sys.time()
  21. cat('Execution begins:', as.character(time_begin), '\n', file=total_execution_time_file, append=TRUE)
  22. ## max_nbins is defined as the one from allowed_max_nbins_seq has the smallest residue, from which, the smallest is taken
  23. n_A = nrow(A)
  24. ## IF THE SEGMENT IS ALREADY SMALL ENOUGH
  25. if( n_A <= max(allowed_max_nbins_seq) )
  26. {
  27. ## This will be modified
  28. {
  29. dists = (2*min_n_bins_outer - 1):(nrow(A)-1)
  30. counts = sapply(dists, function(v) n_cells2compute( A, v, min_n_bins_outer ))
  31. split_info = split2BanlancedChunks_min_max(vec=counts, K=min(length(counts)-1, ncores))
  32. chunks = c(dists[1]-1, dists[split_info$break_points])
  33. res_outer = HRG_core( A=A, chunks=chunks, res_folder=NULL, min_n_bins=min_n_bins_outer, distr=distr, fast=fast )
  34. }
  35. ## this part gets the hi_tree
  36. {
  37. hi_tree = get_tree_v0( res_outer )
  38. igraph::V(hi_tree)$left_rescaled = igraph::V(hi_tree)$left
  39. igraph::V(hi_tree)$right_rescaled = igraph::V(hi_tree)$right
  40. igraph::V(hi_tree)$width_rescaled = igraph::V(hi_tree)$right - igraph::V(hi_tree)$left + 1
  41. igraph::V(hi_tree)$name = paste('(',igraph::V(hi_tree)$left_rescaled, ',', igraph::V(hi_tree)$right_rescaled, ')', sep='')
  42. }
  43. full_tree = hi_tree
  44. res_info = list( arg_list=arg_list, trunk=NULL, segmentss_nadj=NULL, segmentss=NULL, res_outer=NULL, res_inner=list(res_outer) )
  45. res_info$full_tree = full_tree
  46. res_folder_final = file.path(res_dir, 'final')
  47. dir.create(res_folder_final, recursive=TRUE, showWarnings = TRUE)
  48. save(res_info, file=file.path(res_folder_final, 'res_info.Rdata'))
  49. time_finish = Sys.time()
  50. cat('Execution finishes:', as.character(time_finish), '\n\n', file=total_execution_time_file, append=TRUE)
  51. cat('Total execution time:', capture.output( time_finish - time_begin ), '\n\n', file=total_execution_time_file, append=TRUE)
  52. return(res_info)
  53. }
  54. ## which one leads to the least residue
  55. max_nbins = allowed_max_nbins_seq[ min(which(n_A%%allowed_max_nbins_seq==min(n_A%%allowed_max_nbins_seq))) ]
  56. residue = n_A %% max_nbins
  57. residue_mat_info = get_least_residue_matrix(A, max_nbins=max_nbins, allowed_shifts=0)
  58. A_final = residue_mat_info$A_final
  59. n2one = residue_mat_info$n2one
  60. ## A_low has no row/col name
  61. A_low = HighResolution2Low_k( A_final, k=max_nbins ) ## k is the dimension of A_final_low
  62. ## Starting from here, the original row/col names are no longer used
  63. ## this part run HRG_core on the outer part
  64. {
  65. dists = (2*min_n_bins_outer - 1):(nrow(A_low)-1)
  66. counts = sapply(dists, function(v) n_cells2compute( A_low, v, min_n_bins_outer ))
  67. split_info = split2BanlancedChunks_min_max(vec=counts, K=min(length(counts)-1, ncores))
  68. chunks = c(dists[1]-1, dists[split_info$break_points])
  69. res_outer = HRG_core( A=A_low, chunks=chunks, res_folder=NULL, min_n_bins=min_n_bins_outer, distr=distr, fast=fast )
  70. }
  71. ## this part gets the hi_tree
  72. {
  73. hi_tree = get_tree_v0( res_outer )
  74. igraph::V(hi_tree)$left_rescaled = n2one*(igraph::V(hi_tree)$left - 1) + 1
  75. igraph::V(hi_tree)$right_rescaled = n2one*igraph::V(hi_tree)$right
  76. node2adjust = which(igraph::V(hi_tree)$right_rescaled==max(igraph::V(hi_tree)$right_rescaled)) ## append the residue bins here
  77. igraph::V(hi_tree)[node2adjust]$right_rescaled = n_A
  78. igraph::V(hi_tree)$width_rescaled = igraph::V(hi_tree)$right_rescaled - igraph::V(hi_tree)$left_rescaled + 1
  79. igraph::V(hi_tree)$name = paste('(',igraph::V(hi_tree)$left_rescaled, ',', igraph::V(hi_tree)$right_rescaled, ')', sep='')
  80. }
  81. # segmentss = get_segments(hi_tree, binsize_thresh=max_nbins_bp)
  82. segmentss_nadj = get_segments(hi_tree, binsize_thresh=max_nbins_fine)
  83. ## THIS PARAT ADJUST THE segmentss BASED ON TOPDOM BIN_SIGNAL
  84. ## 14-05-2018
  85. ## Also adjust the tree
  86. if(adjust_segmentss_topdom==TRUE)
  87. {
  88. shift = seg[1]-1
  89. segmentss = segmentss_adjust_topdom(pA_sym, segmentss_nadj+shift, n2one, ws=5:20) - shift
  90. for( i in 1:nrow(segmentss) )
  91. {
  92. index_left2adj = which(igraph::V(hi_tree)$left_rescaled == segmentss_nadj[i,1])
  93. index_right2adj = which(igraph::V(hi_tree)$right_rescaled == segmentss_nadj[i,2])
  94. igraph::V(hi_tree)[index_left2adj]$left_rescaled = segmentss[i,1]
  95. igraph::V(hi_tree)[index_right2adj]$right_rescaled = segmentss[i,2]
  96. igraph::V(hi_tree)$width_rescaled = igraph::V(hi_tree)$right_rescaled - igraph::V(hi_tree)$left_rescaled + 1
  97. igraph::V(hi_tree)$name = paste('(',igraph::V(hi_tree)$left_rescaled, ',', igraph::V(hi_tree)$right_rescaled, ')', sep='')
  98. }
  99. }
  100. trunk = hi_tree
  101. # save(segmentss, file=file.path(res_dir, 'outer', 'segmentss.Rdata'))
  102. res_inner = rep(list(list()), nrow(segmentss))
  103. for( i in 1:nrow(segmentss) )
  104. {
  105. cat(i,'\n')
  106. # if(i==2) while(as.numeric(substr(as.character(Sys.time()),12,13))!=20) {}
  107. index = segmentss[i,1]:segmentss[i,2]
  108. A_part = A[index, index]
  109. dists = (2*min_n_bins_inner - 1):(nrow(A_part)-1)
  110. counts = sapply(dists, function(v) n_cells2compute( A_part, v, min_n_bins_inner ))
  111. split_info = split2BanlancedChunks_min_max(vec=counts, K=min(length(counts)-1, ncores))
  112. chunks = c(dists[1]-1, dists[split_info$break_points])
  113. ## A_part is named
  114. res_info_inner = HRG_core( A=A_part, chunks=chunks, res_folder=NULL, min_n_bins=min_n_bins_inner, distr=distr, fast=fast )
  115. res_inner[[i]] = res_info_inner #minimum size filtering
  116. cat('finished', '\n')
  117. }
  118. res_info = list( arg_list=arg_list, trunk=trunk, segmentss_nadj=segmentss_nadj, segmentss=segmentss, res_outer=res_outer, res_inner=res_inner )
  119. res_folder_final = file.path(res_dir, 'final')
  120. dir.create(res_folder_final, recursive=TRUE, showWarnings = TRUE)
  121. save(res_info, file=file.path(res_folder_final, 'res_info.Rdata'))
  122. time_finish = Sys.time()
  123. cat('Execution finishes:', as.character(time_finish), '\n\n', file=total_execution_time_file, append=TRUE)
  124. cat('Total execution time:', capture.output( time_finish - time_begin ), '\n\n', file=total_execution_time_file, append=TRUE)
  125. ## xenocfraf:
  126. # res_inner = res_inner[sapply(res_inner, length) > 0]
  127. branches = lapply( res_inner, get_tree_v0 )
  128. for( i in 1:length(branches) ) branches[[i]] = update_branch_name(branches[[i]], root_start=segmentss[i,1])
  129. full_tree = xenocraft( trunk, branches )
  130. names_tmp = do.call(rbind, strsplit(igraph::V(full_tree)$name, ','))
  131. igraph::V(full_tree)$left = substring(names_tmp[,1], 2)
  132. igraph::V(full_tree)$right = substring(names_tmp[,2], 1, nchar(names_tmp[,2])-1)
  133. if(!is_binary_tree(full_tree)) stop("Trunk + branches do not produce a binary tree")
  134. res_info$full_tree = full_tree
  135. save(res_info, file=file.path(res_folder_final, 'res_info.Rdata'))
  136. return( res_info )
  137. }