build_comp_table_opt.R 6.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171
  1. ## function to obtain the optimal compartment calling from various resolutions (bin_sizes)
  2. build_comp_table_opt = function(save_dir, chrs, bin_sizes, with_ref)
  3. {
  4. `%dopar%` <- foreach::`%dopar%`
  5. `%do%` <- foreach::`%do%`
  6. get_opt_index = function(consist_tab)
  7. {
  8. if(ncol(consist_tab)==2) ## if only one bin_size value
  9. {
  10. opt_index = rep(1, nrow(consist_tab))
  11. return(opt_index)
  12. }
  13. mins = apply(consist_tab[,-1], 1, max) - 0.05 ## allow at most 0.05 smaller than the maximum
  14. opt_index = apply(1*((consist_tab[,-1] >= mins)==1), 1, function(v) min(which(v==1)))
  15. return(opt_index)
  16. }
  17. ################################
  18. identities = 'bulk'
  19. {
  20. consist_tab_li = foreach::foreach(identity=identities) %do%
  21. {
  22. # consist_tab = foreach::foreach(bin_size=bin_sizes, .combine=merge) %do%
  23. # {
  24. # bin_size_kb = sprintf("%skb", bin_size/1E3)
  25. # save_dir_binsize = file.path(save_dir, bin_size_kb)
  26. # cor_log_file = paste0(save_dir_binsize, '/cor_with_ref.txt')
  27. # log_file = paste0(save_dir_binsize, '/chr', chr, '_log.txt')
  28. # as.numeric(strsplit(readLines(log_file)[5], 'this chr is:')[[1]][2])
  29. # cor_tab = data.table::fread(cor_log_file)
  30. # colnames(cor_tab) = c("chr", bin_size_kb)
  31. # cor_tab
  32. # }
  33. consist_tab = foreach::foreach(bin_size=bin_sizes, .combine=merge) %do%
  34. {
  35. bin_size_kb = sprintf("%skb", bin_size/1E3)
  36. save_dir_binsize = file.path(save_dir, 'intermediate_data/sub_compartments', bin_size_kb)
  37. consist_tab_tmp = data.table::data.table(chr=paste0('chr', chrs), val=0)
  38. consist_tab_tmp$val = foreach::foreach(chr=chrs, .combine=c) %do%
  39. {
  40. log_file = paste0(save_dir_binsize, '/chr', chr, '_log.txt')
  41. # print(log_file)
  42. cor_val = as.numeric(strsplit(readLines(log_file)[5], 'this chr is:')[[1]][2])
  43. # print(cor_val)
  44. cor_val
  45. }
  46. colnames(consist_tab_tmp)[2] = bin_size_kb
  47. consist_tab_tmp
  48. }
  49. s = gsub('chr', '', consist_tab[['chr']])
  50. x <- suppressWarnings(as.numeric(s)) ## https://stackoverflow.com/questions/70080294/sort-column-in-r-strings-first-alphabetically-then-numbers-numerically
  51. consist_tab = consist_tab[order(x, 'is.na<-'(s, !is.na(x))), ]
  52. # print((consist_tab))
  53. }
  54. names(consist_tab_li) = identities
  55. min_consist = (sapply(consist_tab_li, function(v) min(v[,2])))
  56. ################################ Choose the best bin size (as small as possible, and save the chosen comp to the opt dir)
  57. # silent_out = foreach::foreach(identity=identities, .combine=merge) %do% ## do not use dopar
  58. {
  59. save_dir_opt = file.path(save_dir, "sub_compartments")
  60. dir.create(save_dir_opt, recursive=TRUE, showWarnings=FALSE)
  61. consist_tab = consist_tab_li[[identity]]
  62. opt_index = get_opt_index(consist_tab)
  63. names(opt_index) = gsub(":", "", consist_tab$chr)
  64. opt_bin_tab = data.table::data.table(chr=names(opt_index), opt_binsize=(colnames(consist_tab)[-1])[opt_index])
  65. consist_tab_tmp = cbind( consist_tab, opt_bin_tab[, 'opt_binsize'])
  66. colnames(consist_tab_tmp)[ncol(consist_tab_tmp)] = 'opt_binsize'
  67. # consist_tab_tmp$chr_num = gsub(':', '', gsub("chr", "", consist_tab_tmp$chr))
  68. # consist_tab_tmp[chr=="chrX:"]$chr_num = 23
  69. # consist_tab_tmp = consist_tab_tmp[order(chr_num)]
  70. cor_all_file = file.path(save_dir_opt, "cor_with_ref.ALL.txt")
  71. data.table::fwrite(consist_tab_tmp, file=cor_all_file, col.names=TRUE, sep="\t")
  72. cor_with_ref = foreach::foreach(j=1:nrow(opt_bin_tab), .combine=c) %do%
  73. {
  74. chr_query = opt_bin_tab$chr[j]
  75. opt_binsize = opt_bin_tab$opt_binsize[j]
  76. row_index = which(consist_tab$chr == chr_query)
  77. col_index = which(colnames(consist_tab)==opt_binsize)
  78. as.data.frame(consist_tab)[row_index, col_index] ## some wired behavior when using data.table. Convert to data.frame
  79. }
  80. cor_with_ref_tab = data.table::data.table(chr=consist_tab$chr, cor=cor_with_ref, chr_num=gsub("chr", "", opt_bin_tab$chr))
  81. s = cor_with_ref_tab$chr_num
  82. x <- suppressWarnings(as.numeric(s)) ## https://stackoverflow.com/questions/70080294/sort-column-in-r-strings-first-alphabetically-then-numbers-numerically
  83. cor_with_ref_tab = cor_with_ref_tab[order(x, 'is.na<-'(s, !is.na(x))), ]
  84. cor_opt_file = file.path(save_dir_opt, "cor_with_ref.txt")
  85. data.table::fwrite(cor_with_ref_tab[, 1:2], file=cor_opt_file, col.names=TRUE, sep="\t")
  86. ################################ make plot
  87. cor_with_ref_tab$index = 1:nrow(cor_with_ref_tab)
  88. cor_opt_plot_file = file.path(save_dir_opt, "cor_with_ref.pdf")
  89. pdf(cor_opt_plot_file, width=8, height=6)
  90. p = ggplot2::ggplot(data=cor_with_ref_tab, ggplot2::aes(x=index, y=cor, label = format(round(cor_with_ref_tab$cor, 2), nsmall = 2))) + ggplot2::geom_line(color="red") + ggplot2::geom_point()
  91. p = p + ggplot2::geom_label() + ggplot2::scale_x_continuous(labels=cor_with_ref_tab$chr_num, breaks=1:nrow(cor_with_ref_tab))
  92. p = p + ggplot2::geom_hline(yintercept=ifelse(with_ref, 0.4, 0.15), linetype=2, color='darkblue')
  93. if(with_ref) p = p + ggplot2::xlab('chr') + ggplot2::ylab('Rho') + ggplot2::ggtitle('Correlation of compartment rank with reference\nRho < 0.4 indicates imprecise compartment calling')
  94. if(!with_ref) p = p + ggplot2::xlab('chr') + ggplot2::ylab('Rho') + ggplot2::ggtitle('Correlation of compartment rank with reference\nRho < 0.15 indicates imprecise compartment calling')
  95. print(p)
  96. dev.off()
  97. ################################
  98. opt_sub_comp_beds = foreach::foreach(i=1:nrow(opt_bin_tab)) %do%
  99. {
  100. opt_bin_size_kb = opt_bin_tab[[2]][i]
  101. chr_name = opt_bin_tab[[1]][i]
  102. opt_sub_comp_bed_chr = sprintf("%s/%s_sub_compartments.bed", file.path(save_dir, 'intermediate_data/sub_compartments', opt_bin_size_kb), chr_name)
  103. opt_sub_comp_bed_chr
  104. }
  105. # save_opt_file = file.path(save_dir_opt, "all_sub_compartments.bed")
  106. save_opt_file = file.path(save_dir_opt, sprintf("all_sub_compartments.bed"))
  107. cmd_opt = sprintf("cat %s > %s", paste0(opt_sub_comp_beds, collapse=" "), save_opt_file)
  108. system(cmd_opt)
  109. # cmd_replaceX = sprintf("sed -i 's/chrNA/chrX/g' %s", save_opt_file) ## the bed file contains NA because of chrX
  110. # system(cmd_replaceX)
  111. comp_raw = data.table::fread(save_opt_file)
  112. comp_tab = make_comp_tab(comp_raw, bin_size=10E3)
  113. cols2keep = c('chr', 'pos_start', 'pos_end', 'comp_name', 'comp_rank', 'continous_rank')
  114. comp_tab = comp_tab[, c('chr', 'pos_start', 'pos_end', 'comp_name', 'comp_rank', 'continous_rank')]
  115. # comp_tab = comp_tab[, mget(cols2keep)] ## mget does not work for some reasons
  116. save_opt_tab_file = file.path(save_dir_opt, sprintf("all_sub_compartments.tsv"))
  117. data.table::fwrite(comp_tab, file=save_opt_tab_file, sep='\t')
  118. }
  119. }
  120. }