CALDER_main.R 5.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106
  1. CALDER = function(contact_tab_dump=NULL, contact_file_dump=NULL, contact_file_hic=NULL, chrs, bin_size, save_dir, save_intermediate_data=FALSE, swap_AB=0, genome='others', feature_track=NULL, black_list_bins=NULL, n_cores=1, sub_domains=FALSE, single_binsize_only=FALSE)
  2. {
  3. ########################### https://stackoverflow.com/questions/30216613/how-to-use-dopar-when-only-import-foreach-in-description-of-a-package
  4. `%dopar%` <- foreach::`%dopar%`
  5. `%do%` <- foreach::`%do%`
  6. ###########################
  7. chrs = as.character(chrs)
  8. n_chrs = length(chrs)
  9. if(n_chrs > 1) ## when computing for multiple chrs, contact_tab_dump or contact_file_dump should be a list with elements matching to that of chrs
  10. {
  11. if(!is.null(contact_tab_dump))
  12. {
  13. if(class(contact_tab_dump)!='list' | length(contact_tab_dump)!=n_chrs) stop('contact_tab_dump should be a list of contact table with its length equal to the length of chrs you specified\n')
  14. names(contact_tab_dump) = chrs
  15. }
  16. if(!is.null(contact_file_dump))
  17. {
  18. if(class(contact_file_dump)!='list' | length(contact_file_dump)!=n_chrs) stop('contact_file_dump should be a list of contact table with its length equal to the length of chrs you specified\n')
  19. names(contact_file_dump) = chrs
  20. }
  21. }
  22. ########################### try multiple bin_size parameters and choose the one that generates reliable compartments
  23. if(bin_size==5E3) bin_sizes = c(5E3, 10E3, 50E3, 100E3)
  24. if(bin_size==10E3) bin_sizes = c(10E3, 50E3, 100E3)
  25. if(bin_size==20E3) bin_sizes = c(20E3, 40E3, 100E3)
  26. if(bin_size==25E3) bin_sizes = c(25E3, 50E3, 100E3)
  27. if(bin_size==40E3) bin_sizes = c(40E3, 80E3)
  28. if(bin_size==50E3) bin_sizes = c(50E3, 100E3)
  29. if(!(bin_size %in% c(5E3, 10E3, 20E3, 25E3, 40E3, 50E3))) bin_sizes = bin_size
  30. if(genome=='others') single_binsize_only = TRUE
  31. if(single_binsize_only) bin_sizes = bin_size ## do not try multiple bin_sizes
  32. ## choose the already computed good compartment as reference to decide correctly the A/B compartment direction
  33. if(genome!='others') ref_compartment_file = system.file("extdata", sprintf('%s_all_sub_compartments.bed', genome), package = 'CALDER')
  34. if(genome=='others') ref_compartment_file = NULL
  35. ###########################
  36. if(is.null(ref_compartment_file) & is.null(feature_track)) stop('Should either specify genome in one of hg19, hg38, mm9, mm10, or provide a feature_track\n')
  37. ###########################
  38. doParallel::registerDoParallel(cores=n_cores)
  39. for(bin_size2look in bin_sizes)
  40. {
  41. bin_size_kb = sprintf("%skb", bin_size2look/1E3)
  42. save_dir_binsize = file.path(save_dir, 'intermediate_data/sub_compartments', bin_size_kb)
  43. dir.create(save_dir_binsize, recursive=TRUE, showWarnings=FALSE)
  44. }
  45. para_tab = data.table::data.table(expand.grid(bin_sizes, chrs))
  46. for(i in 1:ncol(para_tab)) para_tab[[i]] = as.vector(para_tab[[i]])
  47. colnames(para_tab) = c('bin_size', 'chr')
  48. ########################### compute compartment for each bin_size and chr
  49. silent_out = foreach::foreach(i=1:nrow(para_tab)) %dopar%
  50. {
  51. bin_size2look = para_tab$bin_size[i]
  52. chr = para_tab$chr[i]
  53. bin_size_kb = sprintf("%skb", bin_size2look/1E3)
  54. save_dir_binsize = file.path(save_dir, 'intermediate_data/sub_compartments', bin_size_kb)
  55. save_intermediate_data_tmp = save_intermediate_data*(bin_size2look==bin_size)
  56. CALDER_CD_hierarchy_v2(contact_tab_dump=contact_tab_dump[[chr]], contact_file_dump=contact_file_dump[[chr]], contact_file_hic=contact_file_hic, chr=chr, bin_size_input=bin_size, bin_size2look=bin_size2look, save_dir=save_dir_binsize, save_intermediate_data=save_intermediate_data_tmp, swap_AB=swap_AB, ref_compartment_file=ref_compartment_file, feature_track=feature_track, black_list_bins=black_list_bins)
  57. }
  58. ########################### build optimal compartment from multiple resolutions
  59. try(build_comp_table_opt(save_dir=save_dir, chrs=chrs, bin_sizes=bin_sizes, with_ref=!is.null(genome)))
  60. ########################### if computing sub-domains
  61. if(sub_domains==TRUE)
  62. {
  63. silence_out = foreach::foreach(chr=chrs) %do% ## use one core instead of multi-cores
  64. {
  65. chr_num = gsub('chr', '', chr, ignore.case=TRUE)
  66. intermediate_data_file = sprintf('%s/%skb/chr%s_intermediate_data.Rds', file.path(save_dir, 'intermediate_data/sub_compartments') , bin_size/1E3, chr_num)
  67. save_dir_sub_domains = file.path(save_dir, 'intermediate_data/sub_domains')
  68. dir.create(save_dir_sub_domains, showWarnings = FALSE)
  69. try(CALDER_sub_domains(intermediate_data_file=intermediate_data_file, chr=chr, save_dir=save_dir_sub_domains, bin_size=bin_size))
  70. }
  71. save_dir_opt = file.path(save_dir, "sub_domains")
  72. dir.create(save_dir_opt, recursive=TRUE, showWarnings=FALSE)
  73. save_opt_file = file.path(save_dir_opt, sprintf("all_nested_boundaries.bed"))
  74. opt_sub_domain_beds = sprintf('%s/chr%s_nested_boundaries.bed', file.path(save_dir, 'intermediate_data/sub_domains') , gsub('chr', '', chrs, ignore.case=TRUE))
  75. cmd_opt = sprintf("cat %s > %s", paste0(opt_sub_domain_beds, collapse=" "), save_opt_file)
  76. system(cmd_opt)
  77. }
  78. }