build_comp_table_opt.R 6.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172
  1. build_comp_table_opt = function(save_dir, chrs, bin_sizes, with_ref) ## function to obtain the optimal compartment calling from various resolutions (= bin_sizes)
  2. {
  3. `%dopar%` <- foreach::`%dopar%`
  4. `%do%` <- foreach::`%do%`
  5. get_opt_index = function(consist_tab)
  6. {
  7. if(ncol(consist_tab)==2) ## if only one bin_size value
  8. {
  9. opt_index = rep(1, nrow(consist_tab))
  10. return(opt_index)
  11. }
  12. mins = apply(consist_tab[,-1], 1, max) - 0.05 ## allow at most 0.05 smaller than the maximum
  13. opt_index = apply(1*((consist_tab[,-1] >= mins)==1), 1, function(v) min(which(v==1)))
  14. return(opt_index)
  15. }
  16. ################################
  17. identities = 'bulk'
  18. {
  19. consist_tab_li = foreach::foreach(identity=identities) %do%
  20. {
  21. # consist_tab = foreach::foreach(bin_size=bin_sizes, .combine=merge) %do%
  22. # {
  23. # bin_size_kb = sprintf("%skb", bin_size/1E3)
  24. # save_dir_binsize = file.path(save_dir, bin_size_kb)
  25. # cor_log_file = paste0(save_dir_binsize, '/cor_with_ref.txt')
  26. # log_file = paste0(save_dir_binsize, '/chr', chr, '_log.txt')
  27. # as.numeric(strsplit(readLines(log_file)[5], 'this chr is:')[[1]][2])
  28. # cor_tab = data.table::fread(cor_log_file)
  29. # colnames(cor_tab) = c("chr", bin_size_kb)
  30. # cor_tab
  31. # }
  32. consist_tab = foreach::foreach(bin_size=bin_sizes, .combine=merge) %do%
  33. {
  34. bin_size_kb = sprintf("%skb", bin_size/1E3)
  35. save_dir_binsize = file.path(save_dir, 'intermediate_data/sub_compartments', bin_size_kb)
  36. consist_tab_tmp = data.table::data.table(chr=paste0('chr', chrs), val=0)
  37. consist_tab_tmp$val = foreach::foreach(chr=chrs, .combine=c) %do%
  38. {
  39. log_file = paste0(save_dir_binsize, '/chr', chr, '_log.txt')
  40. # print(log_file)
  41. cor_val = as.numeric(strsplit(readLines(log_file)[5], 'this chr is:')[[1]][2])
  42. # print(cor_val)
  43. cor_val
  44. }
  45. colnames(consist_tab_tmp)[2] = bin_size_kb
  46. consist_tab_tmp
  47. }
  48. s = gsub('chr', '', consist_tab[['chr']])
  49. x <- suppressWarnings(as.numeric(s)) ## https://stackoverflow.com/questions/70080294/sort-column-in-r-strings-first-alphabetically-then-numbers-numerically
  50. consist_tab = consist_tab[order(x, 'is.na<-'(s, !is.na(x))), ]
  51. # print((consist_tab))
  52. }
  53. names(consist_tab_li) = identities
  54. min_consist = (sapply(consist_tab_li, function(v) min(v[,2])))
  55. ################################ Choose the best bin size (as small as possible, and save the chosen comp to the opt dir)
  56. # silent_out = foreach::foreach(identity=identities, .combine=merge) %do% ## do not use dopar
  57. {
  58. save_dir_opt = file.path(save_dir, "sub_compartments")
  59. dir.create(save_dir_opt, recursive=TRUE, showWarnings=FALSE)
  60. consist_tab = consist_tab_li[[identity]]
  61. opt_index = get_opt_index(consist_tab)
  62. names(opt_index) = gsub(":", "", consist_tab$chr)
  63. opt_bin_tab = data.table::data.table(chr=names(opt_index), opt_binsize=(colnames(consist_tab)[-1])[opt_index])
  64. consist_tab_tmp = cbind( consist_tab, opt_bin_tab[, 'opt_binsize'])
  65. colnames(consist_tab_tmp)[ncol(consist_tab_tmp)] = 'opt_binsize'
  66. # consist_tab_tmp$chr_num = gsub(':', '', gsub("chr", "", consist_tab_tmp$chr))
  67. # consist_tab_tmp[chr=="chrX:"]$chr_num = 23
  68. # consist_tab_tmp = consist_tab_tmp[order(chr_num)]
  69. cor_all_file = file.path(save_dir_opt, "cor_with_ref.ALL.txt")
  70. data.table::fwrite(consist_tab_tmp, file=cor_all_file, col.names=TRUE, sep="\t")
  71. cor_with_ref = foreach::foreach(j=1:nrow(opt_bin_tab), .combine=c) %do%
  72. {
  73. chr_query = opt_bin_tab$chr[j]
  74. opt_binsize = opt_bin_tab$opt_binsize[j]
  75. row_index = which(consist_tab$chr == chr_query)
  76. col_index = which(colnames(consist_tab)==opt_binsize)
  77. as.data.frame(consist_tab)[row_index, col_index] ## some wired behavior when using data.table. Convert to data.frame
  78. }
  79. cor_with_ref_tab = data.table::data.table(chr=consist_tab$chr, cor=cor_with_ref, chr_num=gsub("chr", "", opt_bin_tab$chr))
  80. s = cor_with_ref_tab$chr_num
  81. x <- suppressWarnings(as.numeric(s)) ## https://stackoverflow.com/questions/70080294/sort-column-in-r-strings-first-alphabetically-then-numbers-numerically
  82. cor_with_ref_tab = cor_with_ref_tab[order(x, 'is.na<-'(s, !is.na(x))), ]
  83. cor_opt_file = file.path(save_dir_opt, "cor_with_ref.txt")
  84. data.table::fwrite(cor_with_ref_tab[, 1:2], file=cor_opt_file, col.names=TRUE, sep="\t")
  85. ################################ make plot
  86. cor_with_ref_tab$index = 1:nrow(cor_with_ref_tab)
  87. cor_opt_plot_file = file.path(save_dir_opt, "cor_with_ref.pdf")
  88. pdf(cor_opt_plot_file, width=8, height=6)
  89. p = ggplot2::ggplot(data=cor_with_ref_tab, ggplot2::aes(x=index, y=cor, label = format(round(cor_with_ref_tab$cor, 2), nsmall = 2))) + ggplot2::geom_line(color="red") + ggplot2::geom_point()
  90. p = p + ggplot2::geom_label() + ggplot2::scale_x_continuous(labels=cor_with_ref_tab$chr_num, breaks=1:nrow(cor_with_ref_tab))
  91. p = p + ggplot2::geom_hline(yintercept=ifelse(with_ref, 0.4, 0.15), linetype=2, color='darkblue')
  92. if(with_ref) p = p + ggplot2::xlab('chr') + ggplot2::ylab('Rho') + ggplot2::ggtitle('Correlation of compartment rank with reference\nRho < 0.4 indicates imprecise compartment calling')
  93. if(!with_ref) p = p + ggplot2::xlab('chr') + ggplot2::ylab('Rho') + ggplot2::ggtitle('Correlation of compartment rank with reference\nRho < 0.15 indicates imprecise compartment calling')
  94. print(p)
  95. dev.off()
  96. ################################
  97. opt_sub_comp_beds = foreach::foreach(i=1:nrow(opt_bin_tab)) %do%
  98. {
  99. opt_bin_size_kb = opt_bin_tab[[2]][i]
  100. chr_name = opt_bin_tab[[1]][i]
  101. opt_sub_comp_bed_chr = sprintf("%s/%s_sub_compartments.bed", file.path(save_dir, 'intermediate_data/sub_compartments', opt_bin_size_kb), chr_name)
  102. opt_sub_comp_bed_chr
  103. }
  104. # save_opt_file = file.path(save_dir_opt, "all_sub_compartments.bed")
  105. save_opt_file = file.path(save_dir_opt, sprintf("all_sub_compartments.bed"))
  106. cmd_opt = sprintf("cat %s > %s", paste0(opt_sub_comp_beds, collapse=" "), save_opt_file)
  107. system(cmd_opt)
  108. # cmd_replaceX = sprintf("sed -i 's/chrNA/chrX/g' %s", save_opt_file) ## the bed file contains NA because of chrX
  109. # system(cmd_replaceX)
  110. comp_raw = data.table::fread(save_opt_file)
  111. comp_tab = make_comp_tab(comp_raw, bin_size=10E3)
  112. cols2keep = c('chr', 'pos_start', 'pos_end', 'comp_name', 'comp_rank', 'continous_rank')
  113. comp_tab = comp_tab[, c('chr', 'pos_start', 'pos_end', 'comp_name', 'comp_rank', 'continous_rank')]
  114. # comp_tab = comp_tab[, mget(cols2keep)] ## mget does not work for some reasons
  115. save_opt_tab_file = file.path(save_dir_opt, sprintf("all_sub_compartments.tsv"))
  116. data.table::fwrite(comp_tab, file=save_opt_tab_file, sep='\t')
  117. }
  118. }
  119. }