zigzag_nested_domain.R 3.8 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495
  1. ## Yuanlong LIU
  2. ## 12/04/2018
  3. ## 06/06/2018
  4. ## 16/06/2018
  5. ## This code runs zigzag search on each compartment domain as a whole without dividing into 400
  6. ## 09-10-2018
  7. ## A should be already named
  8. ## "zigzag" resembles computational path of finding the optimum
  9. # HRG_zigzag_compartment_domain_main_fun <- function(A, res_dir, compartment_segs, allowed_max_nbins_seq, max_nbins_fine, chr, min_n_bins=2)
  10. HRG_zigzag_compartment_domain_main_fun <- function(A, res_dir, compartment_segs, allowed_max_nbins_seq, max_nbins_fine, min_n_bins=2)
  11. {
  12. arg_list = as.list(environment())
  13. res_folder = file.path(res_dir)
  14. dir.create(res_folder, recursive=TRUE, showWarnings = FALSE)
  15. total_execution_time_file = file.path(res_dir, 'total_execution.time')
  16. time_begin = Sys.time()
  17. cat('Execution begins:', as.character(time_begin), '\n', file=total_execution_time_file, append=TRUE)
  18. ## check whether your input matrix is symmetric
  19. # A_sym = as.matrix( Matrix::forceSymmetric(data.matrix(A), uplo='U') )
  20. # A_sym = Matrix::forceSymmetric(data.matrix(A), uplo='U') ## keep sparse, 2018-11-11
  21. A_sym = Matrix::forceSymmetric(A, uplo='U') ## keep sparse, 2018-11-11
  22. tol = 100 * .Machine$double.eps
  23. max_diff = max(abs(A_sym - A))
  24. notSym_flag = max_diff > tol
  25. if( notSym_flag ) warning('Your input contact matrix is not symmetric. The maximum difference between symmetric values is: ', max_diff, '\nBy default, the contact profile used for downstream analysis is taken from the upper triangular part. To use the lower triangular part you can transpose the input contact matrix first')
  26. if( is.null(rownames(A)) | is.null(colnames(A))) stop('A should be named by the bin indices')
  27. # pA_sym = rm_zeros(A_sym) ## pA_sym: positive A
  28. pA_sym = remove_blank_cols(A_sym, sparse=TRUE, ratio=0)
  29. n_zero_rows = nrow(A_sym) - nrow(pA_sym)
  30. zero_rows_flag = n_zero_rows > 0
  31. if( zero_rows_flag )
  32. {
  33. warning('There are ', n_zero_rows, ' rows/columns in your input contact matrix that have all their values being 0. These rows/columns are removed for downstream analysis')
  34. original_row_names = rownames(A_sym)
  35. kept_row_names = rownames(pA_sym)
  36. }
  37. # res_inner = rep(list(), nrow(compartment_segs)) ## for each compartment domain
  38. # for( i in 1:nrow(compartment_segs) )
  39. # {
  40. # seg = compartment_segs[i,1]:compartment_segs[i,2]
  41. # cat('Compute seg:', i, 'of length:', length(seg), '\n')
  42. # A_seg = as.matrix(pA_sym[seg, seg])
  43. # res_zigzag = zigzag_loglik_ancestors_v4(A_seg, nrow(A_seg))
  44. # res_outer = list(A=A_seg, L=res_zigzag$L, ancestors=res_zigzag$ancestors)
  45. # res_inner[[i]] = res_outer
  46. # cat('finished', '\n')
  47. # }
  48. ## changed to paralell, 2018-11-11
  49. cat('\n')
  50. res_inner = foreach::foreach(i=1:nrow(compartment_segs)) %do%
  51. {
  52. seg = compartment_segs[i,1]:compartment_segs[i,2]
  53. cat('\r', sprintf('Find sub-domains in %d of %d CDs | length of current CD: %d bins', i, nrow(compartment_segs), length(seg)))
  54. A_seg = as.matrix(pA_sym[seg, seg])
  55. res_zigzag = zigzag_loglik_ancestors_v4_5(A_seg, nrow(A_seg), min_n_bins=min_n_bins)
  56. res_outer = list(A=A_seg, L=res_zigzag$L, ancestors=res_zigzag$ancestors, min_n_bins=min_n_bins)
  57. res_outer
  58. # res_inner[[i]] = res_outer
  59. }
  60. cat('\n')
  61. segmentss = compartment_segs
  62. res_info = list( arg_list=arg_list, pA_sym=pA_sym, A_final=pA_sym, segmentss=segmentss, res_inner=res_inner )
  63. # res_folder_final = file.path(res_dir, 'final')
  64. # dir.create(res_folder_final, recursive=TRUE, showWarnings = TRUE)
  65. # save(res_info, file=file.path(res_folder_final, 'res_info.Rdata'))
  66. time_finish = Sys.time()
  67. cat('Execution finishes:', as.character(time_finish), '\n\n', file=total_execution_time_file, append=TRUE)
  68. cat('Total execution time:', capture.output( time_finish - time_begin ), '\n\n', file=total_execution_time_file, append=TRUE)
  69. return( res_info )
  70. }