zigzag_nested_domain_v2.R 3.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990
  1. ## Yuanlong LIU
  2. ## 12/04/2018
  3. ## 06/06/2018
  4. ## 16/06/2018
  5. ## This code runs zigzag search on each compartment domain as a whole without dividing into 400
  6. ## 09-10-2018
  7. ## A should be already named
  8. ## "zigzag" resembles computational path of finding the optimum
  9. # HRG_zigzag_compartment_domain_main_fun <- function(A, res_dir, compartment_segs, allowed_max_nbins_seq, max_nbins_fine, chr, min_n_bins=2)
  10. HRG_zigzag_compartment_domain_main_fun <- function(A, res_dir, compartment_segs, allowed_max_nbins_seq=NULL, max_nbins_fine=NULL, min_n_bins=2)
  11. {
  12. `%dopar%` <- foreach::`%dopar%`
  13. `%do%` <- foreach::`%do%`
  14. arg_list = as.list(environment())
  15. res_folder = file.path(res_dir)
  16. dir.create(res_folder, recursive=TRUE, showWarnings = FALSE)
  17. total_execution_time_file = file.path(res_dir, 'total_execution.time')
  18. time_begin = Sys.time()
  19. cat('Execution begins:', as.character(time_begin), '\n', file=total_execution_time_file, append=TRUE)
  20. ## check whether your input matrix is symmetric
  21. # A_sym = as.matrix( Matrix::forceSymmetric(data.matrix(A), uplo='U') )
  22. # A_sym = Matrix::forceSymmetric(data.matrix(A), uplo='U') ## keep sparse, 2018-11-11
  23. A_sym = Matrix::forceSymmetric(A, uplo='U') ## keep sparse, 2018-11-11
  24. tol = 100 * .Machine$double.eps
  25. max_diff = max(abs(A_sym - A))
  26. notSym_flag = max_diff > tol
  27. if( notSym_flag ) warning('Your input contact matrix is not symmetric. The maximum difference between symmetric values is: ', max_diff, '\nBy default, the contact profile used for downstream analysis is taken from the upper triangular part. To use the lower triangular part you can transpose the input contact matrix first')
  28. if( is.null(rownames(A)) | is.null(colnames(A))) stop('A should be named by the bin indices')
  29. # pA_sym = rm_zeros(A_sym) ## pA_sym: positive A
  30. pA_sym = as.matrix(remove_blank_cols(A_sym, sparse=TRUE, ratio=0))
  31. n_zero_rows = nrow(A_sym) - nrow(pA_sym)
  32. zero_rows_flag = n_zero_rows > 0
  33. if( zero_rows_flag )
  34. {
  35. warning('There are ', n_zero_rows, ' rows/columns in your input contact matrix that have all their values being 0. These rows/columns are removed for downstream analysis')
  36. original_row_names = rownames(A_sym)
  37. kept_row_names = rownames(pA_sym)
  38. }
  39. # res_inner = rep(list(), nrow(compartment_segs)) ## for each compartment domain
  40. # for( i in 1:nrow(compartment_segs) )
  41. # {
  42. # seg = compartment_segs[i,1]:compartment_segs[i,2]
  43. # cat('Compute seg:', i, 'of length:', length(seg), '\n')
  44. # A_seg = as.matrix(pA_sym[seg, seg])
  45. # res_zigzag = zigzag_loglik_ancestors_v4(A_seg, nrow(A_seg))
  46. # res_outer = list(A=A_seg, L=res_zigzag$L, ancestors=res_zigzag$ancestors)
  47. # res_inner[[i]] = res_outer
  48. # cat('finished', '\n')
  49. # }
  50. ## changed to paralell, 2018-11-11
  51. res_inner = foreach::foreach(i=1:nrow(compartment_segs)) %do%
  52. {
  53. seg = compartment_segs[i,1]:compartment_segs[i,2]
  54. cat('\r', sprintf('Find sub-domains in %d of %d CDs | length of current CD: %d bins\n', i, nrow(compartment_segs), length(seg)))
  55. A_seg = pA_sym[seg, seg]
  56. res_zigzag = zigzag_loglik_ancestors_v4_5(A_seg, nrow(A_seg), min_n_bins=min_n_bins)
  57. res_outer = list(A=A_seg, L=res_zigzag$L, ancestors=res_zigzag$ancestors, min_n_bins=min_n_bins)
  58. res_outer
  59. # res_inner[[i]] = res_outer
  60. }
  61. cat('\n')
  62. segmentss = compartment_segs
  63. res_info = list( arg_list=arg_list, pA_sym=pA_sym, A_final=pA_sym, segmentss=segmentss, res_inner=res_inner )
  64. # res_folder_final = file.path(res_dir, 'final')
  65. # dir.create(res_folder_final, recursive=TRUE, showWarnings = TRUE)
  66. # save(res_info, file=file.path(res_folder_final, 'res_info.Rdata'))
  67. time_finish = Sys.time()
  68. cat('Execution finishes:', as.character(time_finish), '\n', file=total_execution_time_file, append=TRUE)
  69. cat('Total execution time:', capture.output( time_finish - time_begin ), '\n', file=total_execution_time_file, append=TRUE)
  70. return( res_info )
  71. }