post_processing_nested_domains.R 7.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162
  1. get_cluster_index <- function(pos, initial_clusters, bin_names, bin_size)
  2. {
  3. compartment_segs = generate_compartment_segs( initial_clusters )
  4. CDs = get_original_tad_indices( bin_names, compartment_segs, bin_size=bin_size )
  5. cluster_index = which(apply( CDs, 1, function(v) (v[1] < pos)&(v[2]>pos) )==1)
  6. return(cluster_index)
  7. }
  8. get_original_tad_indices_extra <- function(names_A_final, TADs, bin_size) ## to get the TADs for the extra edges
  9. {
  10. # start_pos = as.numeric(names_A_final[TADs$start_pos])
  11. end_pos = as.numeric(names_A_final[TADs$end_pos])
  12. # start_pos_ori = (start_pos - 1)*bin_size + 1
  13. end_pos_ori = end_pos*bin_size
  14. TADs = data.frame( start_pos_ori=end_pos_ori+1, end_pos_ori=end_pos_ori )
  15. return( TADs )
  16. }
  17. LikelihoodRatioTest <- function(sub_domains_raw, ncores, remove_zero=FALSE, distr, n_parameters=3, imputation_num=1E2, A_already_corrected=FALSE)
  18. {
  19. `%dopar%` <- foreach::`%dopar%`
  20. `%do%` <- foreach::`%do%`
  21. pA_sym = sub_domains_raw$pA_sym
  22. if(A_already_corrected==TRUE) cpA_sym = pA_sym
  23. ## CAN SPEEDUP correct_A_fast_divide_by_mean BECAUSE ONLY SOME OFF-DIAGNAL LINES NEED TO BE CORRECTED
  24. if(A_already_corrected==FALSE) cpA_sym = correct_A_fast_divide_by_mean(pA_sym, remove_zero=remove_zero) ## corrected pA_sym
  25. trees = foreach::foreach(j=1:length( sub_domains_raw$res_inner )) %do%
  26. {
  27. name_index = rownames(sub_domains_raw$pA_sym)[sub_domains_raw$segmentss[j,1]:sub_domains_raw$segmentss[j,2]]
  28. sub_domains_raw$res_inner[[j]]$cA = as.matrix(cpA_sym[name_index, name_index]) ## the corrected A. Use this matrix is essential for reducing the distance-based false positves
  29. ## for example, when the number of invovled bins in sub_domains_raw$res_inner[[j]] is too small
  30. # tmp = try(get_tree_decoration( sub_domains_raw$res_inner[[j]], distr=distr, n_parameters=n_parameters, imputation_num=imputation_num ))
  31. tmp = get_tree_decoration( sub_domains_raw$res_inner[[j]], distr=distr, n_parameters=n_parameters, imputation_num=imputation_num )
  32. # if( class(tmp)=="try-error" )
  33. if(class(tmp)!='igraph') ## if(tmp=='bad_tree')
  34. {
  35. root_tree = igraph::graph.empty() + 'root'
  36. igraph::V(root_tree)$width = nrow(sub_domains_raw$res_inner[[j]]$A)
  37. igraph::V(root_tree)$left = 1
  38. igraph::V(root_tree)$right = igraph::V(root_tree)$width
  39. tmp = root_tree ## represented by the length of the segment
  40. }
  41. tmp
  42. }
  43. return(trees)
  44. }
  45. # pipline <- function(chr, TADs_raw, TADs_type, just_save_no_nest=FALSE, bin_size) ## TADs_tmp: start end
  46. create_TADs <- function(sub_domains_raw, chr, TADs_raw, TADs_type, just_save_no_nest=FALSE, bin_size) ## TADs_tmp: start end
  47. {
  48. chr_name = paste0('chr', chr)
  49. bin_size_kb = bin_size / 1E3
  50. if(just_save_no_nest)
  51. {
  52. TADs = get_original_tad_indices( rownames(sub_domains_raw$pA_sym), TADs_raw, bin_size=bin_size_kb*1E3 )
  53. TADs = cbind(chr_name, TADs)
  54. save( TADs, file=Tads_R_File )
  55. }
  56. TADs = get_original_tad_indices( rownames(sub_domains_raw$pA_sym), TADs_raw, bin_size=bin_size_kb*1E3 )
  57. # if(TADs_type=='extra') TADs = get_original_tad_indices_extra( rownames(sub_domains_raw$pA_sym), TADs_raw, bin_size=bin_size_kb*1E3 )
  58. # print(TADs)
  59. TADs = cbind(chr_name, TADs)
  60. return(TADs)
  61. }
  62. #####################################################################
  63. ## INDEED, WHEN RESOLUTION == 40KB, DO NOT REMOVE
  64. clean_TADs_all = function(TADs_all_raw, CDs, bin_size)
  65. {
  66. TADs_all_end = TADs_all_raw[,2]
  67. CD_end = CDs[,2]
  68. outer_mat = outer(TADs_all_end, CD_end, "-")
  69. min_dist = apply(outer_mat, 1, function(v) min(abs(v)))
  70. # end2rm = TADs_all_end[(min_dist <= 3) & (min_dist > 0)] ## remove nested boundaris too close to CD boundary, at least 40kb
  71. end2rm = TADs_all_end[(min_dist < 40E3/bin_size) & (min_dist > 0)] ## remove nested boundaris too close to CD boundary, at least 40kb
  72. ## INDEED, WHEN RESOLUTION == 40KB, DO NOT REMOVE
  73. TADs_all_head = TADs_all_raw[,1]
  74. CD_head = CDs[,1]
  75. outer_mat = outer(TADs_all_head, CD_head, "-")
  76. min_dist = apply(outer_mat, 1, function(v) min(abs(v)))
  77. head2rm = TADs_all_head[(min_dist < 40E3/bin_size) & (min_dist > 0)] ## remove nested boundaris too close to CD boundary
  78. TADs_all = subset(TADs_all_raw, (!(start_pos %in% head2rm)) & (!(end_pos %in% end2rm)) )
  79. return(TADs_all)
  80. }
  81. post_process_sub_domains = function(chr, sub_domains_raw, ncores, out_dir, bin_size)
  82. {
  83. distr=c('lnorm', 'wilcox')[2]
  84. remove_zero = FALSE
  85. n_parameters = 3
  86. imputation_num = 1E2
  87. A_already_corrected = FALSE
  88. decorated_branches = LikelihoodRatioTest(sub_domains_raw=sub_domains_raw, ncores=ncores, remove_zero=FALSE, distr=distr, n_parameters=n_parameters, imputation_num=imputation_num, A_already_corrected=A_already_corrected)
  89. chr_name = paste0('chr', chr)
  90. mean_diff_thresh = -0.1
  91. i = 1
  92. # for(i in 1:length(p0s))
  93. {
  94. # cat(i, '\n')
  95. # p0 = p0s[i]
  96. normal_decorated_branches = decorated_branches[sapply(decorated_branches, igraph::vcount) > 1]
  97. # ps = sort(unlist(lapply(normal_decorated_branches, function(v) igraph::V(v)$wilcox_p)), decreasing=FALSE) ## fdr correction
  98. # p0_adj = ps[min(which(p.adjust(ps, method = 'fdr') > p0))]
  99. # p0_adj = ps[min(which(p.adjust(ps, method = 'bonferroni') > p0))]
  100. p0_adj = 0.05
  101. TADs_all_raw = prunning(decorated_branches, to_correct=FALSE, p0=p0_adj, width_thresh=2, width_thresh_CD=width_thresh_CD, all_levels=TRUE, CD_border_adj=FALSE, top_down=TRUE, mean_diff_thresh=mean_diff_thresh)
  102. # CDs = prunning(decorated_branches, to_correct=FALSE, p0=p0_adj, width_thresh=2, width_thresh_CD=width_thresh_CD, all_levels=FALSE, CD_border_adj=FALSE, top_down=TRUE); cat(nrow(CDs), '\n')
  103. CDs = prunning(decorated_branches, to_correct=FALSE, p0=-1, width_thresh=2, width_thresh_CD=width_thresh_CD, all_levels=FALSE, CD_border_adj=FALSE, top_down=TRUE, mean_diff_thresh=mean_diff_thresh)
  104. # if(nrow(CDs)!=length(res$initial_clusters)) stop('nrow(CDs)!=length(res$initial_clusters)')
  105. TADs_all = clean_TADs_all(TADs_all_raw, CDs, bin_size=bin_size)
  106. # Tad_edges <- sort(unique(c(TADs_tmp[,1], (TADs_tmp[,2]+1))))
  107. TADs_all_edges <- sort(unique(TADs_all[,2]))
  108. CD_edges <- sort(unique(CDs[,2]))
  109. TADs_extra_edges = unique(setdiff(TADs_all_edges, CD_edges))
  110. TADs_extra = data.frame(start_pos=TADs_extra_edges, end_pos= TADs_extra_edges)
  111. TADs_pos_all = create_TADs(sub_domains_raw=sub_domains_raw, chr, TADs_all, TADs_type='ALL', bin_size=bin_size)
  112. TADs_pos_extra = create_TADs(sub_domains_raw=sub_domains_raw, chr, TADs_extra, TADs_type='extra', bin_size=bin_size)
  113. TADs_pos_CD = create_TADs(sub_domains_raw=sub_domains_raw, chr, CDs, TADs_type='CD', bin_size=bin_size)
  114. # TADs_info = list(decorated_branches=decorated_branches, TADs_pos_all=TADs_pos_all, TADs_pos_CD=TADs_pos_CD)
  115. TADs_info = list(decorated_branches=decorated_branches, TADs_all=TADs_all, CDs=CDs, TADs_pos_all=TADs_pos_all, TADs_pos_CD=TADs_pos_CD, TADs_pos_extra=TADs_pos_extra)
  116. }
  117. options(stringsAsFactors=FALSE)
  118. level_5 = TADs_info$TADs_pos_extra[, c('chr_name', 'end_pos_ori')]
  119. colnames(level_5)[1:2] = c('chr', 'nested_boundary')
  120. sub_domain_boundary_bed_file = paste0(out_dir, '/chr', chr, '_nested_boundaries.bed')
  121. level_5_bed = data.frame(level_5, level_5[,2], '', '.', level_5, level_5[,2], '#000000')
  122. op <- options(scipen=999)
  123. write.table( level_5_bed, file=sub_domain_boundary_bed_file, quote=FALSE, sep='\t', row.names=FALSE, col.names=FALSE )
  124. on.exit(options(op))
  125. return(NULL)
  126. }