calder 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297
  1. #!/usr/bin/env Rscript
  2. ###########################################################################
  3. # 该程序修改自 https://github.com/CSOgroup/CALDER2/blob/main/scripts/calder
  4. # 1. 源程序无法正确处理染色体名称不以 chr 为前缀的情况
  5. # 2. 只修改了 cool 格式作为输入文件,hic 格式预计仍然有 bug 未测试
  6. # 3. 如果染色体前缀不一致,无法处理
  7. # 4. 增加了对 mcool 格式的兼容
  8. # 修改人:张旭东 zhangxudong@genek.cn
  9. ###########################################################################
  10. suppressPackageStartupMessages(library(optparse))
  11. suppressPackageStartupMessages(library(CALDER))
  12. AVAILABLE_REFERENCE_TRACKS_GENOMES <- c("hg19", "hg38", "mm9", "mm10")
  13. INPUT_TYPES <- c("hic", "cool", "mcool")
  14. CHROMS_TO_REMOVE <- c("ALL", "M", "chrM", "MT", "chrMT", "Y", "chrY")
  15. parse_arguments <- function(){
  16. # Creating the argument parsing options
  17. option_list = list(
  18. make_option(c("-i", "--input"), action="store", default=NA, type='character',
  19. help="Input Hi-C contacts"),
  20. # 增加对 mcool 格式的兼容
  21. make_option(c("-t", "--type"), action="store", default='hic', type='character',
  22. help="The type of input: hic or cool or mcool [default %default]"),
  23. make_option(c("-b", "--bin_size"), action="store", default=50000, type='integer',
  24. help="Bin size to use for the analysis [default %default]"),
  25. make_option(c("-g", "--genome"), action="store", default="hg19", type='character',
  26. help="Genome assembly to use [default %default]"),
  27. make_option(c("-f", "--feature_track"), action="store", default=NA, type='character',
  28. help="Genomic feature track to be used to determine A/B compartment direction
  29. when genome == 'others'. The track should presumably have higher values
  30. in A than in B compartmnets. [default %default]"),
  31. # 增加了原染色体前缀参数
  32. make_option(c("--chr_prefix"), action="store", default="chr", type='character',
  33. help="old chr prefix [default %default]"),
  34. make_option(c("-c", "--chromosomes"), action='store', default='all', type='character',
  35. help="Chromosomes to analyze, separated by comma. [default %default]"),
  36. make_option(c("-p", "--nproc"), action="store", default=1, type='integer',
  37. help="Number of cores to use [default %default]"),
  38. make_option(c("-o", "--outpath"), action="store", default=NA, type='character',
  39. help="Path to the output folder"),
  40. make_option(c("-k", "--keep_intermediate"), action="store_true", default=FALSE, type='logical',
  41. help="Keep intermediate data after done [default %default]"),
  42. make_option(c("-a", "--adaptive"), action="store_true", default=FALSE, type='logical',
  43. help="Use adaptive resolution choice [default %default]")
  44. )
  45. parser <- OptionParser(usage = "%prog [options]", option_list=option_list)
  46. opt <- parse_args(parser)
  47. # Checking if input path exists
  48. if(is.na(opt$input)){
  49. print_help(parser)
  50. stop(paste0("Input path (", opt$input,") does not exist"))
  51. }
  52. # Checking if output path is provided
  53. if(is.na(opt$outpath)){
  54. stop("Output path was not provided")
  55. }
  56. # Check that the input type is one of the possible ones
  57. if(!(opt$type %in% INPUT_TYPES)){
  58. stop(paste0("Input type ", opt$input_type, " not available"))
  59. }
  60. # Check if the provided genome is in the list of available reference genomes
  61. # or if a feature track is provided
  62. if((!(opt$genome %in% AVAILABLE_REFERENCE_TRACKS_GENOMES)) || (file.exists(opt$feature_track))){
  63. # in this case, we just assign it the name 'others'
  64. opt$genome = "others"
  65. }
  66. writeLines(c(
  67. "*******************************",
  68. "* CALDER *",
  69. "*******************************",
  70. paste0("[Parameters] Input: ", opt$input),
  71. paste0("[Parameters] Input type: ", opt$type),
  72. paste0("[Parameters] Bin size: ", opt$bin_size),
  73. paste0("[Parameters] Genome: ", opt$genome),
  74. paste0("[Parameters] Feature Track: ", opt$feature_track),
  75. paste0("[Parameters] Chr Prefix: ", opt$chr_prefix),
  76. paste0("[Parameters] Chromosomes: ", opt$chromosomes),
  77. paste0("[Parameters] N. cores: ", opt$nproc),
  78. paste0("[Parameters] Output: ", opt$outpath),
  79. paste0("[Parameters] Keep Intermediate data: ", opt$keep_intermediate),
  80. paste0("[Parameters] Use adaptive resolution: ", opt$adaptive)
  81. ))
  82. if(file.exists(opt$feature_track)){
  83. opt$feature_track <- read.table(opt$feature_track, header = F)
  84. }
  85. return(opt)
  86. }
  87. sanitize_chroms <- function(chroms){
  88. res <- lapply(chroms, function(x){
  89. if(startsWith(x, opt$chr_prefix)){
  90. return(substring(x, 4))
  91. } else{
  92. return(x)
  93. }
  94. })
  95. return(res)
  96. }
  97. handle_input_hic <- function(opt){
  98. suppressPackageStartupMessages(library(strawr))
  99. chromsizes <- readHicChroms(opt$input)
  100. if(opt$chromosomes == "all"){
  101. chroms <- chromsizes[!(toupper(chromsizes$name) %in% toupper(CHROMS_TO_REMOVE)), "name"]
  102. }
  103. else{
  104. chrom_list <- strsplit(opt$chromosomes, ",")[[1]]
  105. chroms <- chromsizes[chromsizes$name %in% chrom_list, "name"]
  106. }
  107. chroms <- sanitize_chroms(chroms)
  108. CALDER(contact_file_hic = opt$input,
  109. chrs = chroms,
  110. bin_size = opt$bin_size,
  111. genome = opt$genome,
  112. save_dir=opt$outpath,
  113. save_intermediate_data=TRUE,
  114. feature_track=opt$feature_track,
  115. single_binsize_only=!opt$adaptive,
  116. n_cores = opt$nproc,
  117. sub_domains=T)
  118. }
  119. # 增加了对 mcool 的支持
  120. handle_input_mcool <- function(opt){
  121. intermediate_data_dir = file.path(opt$outpath, "intermediate_data")
  122. dir.create(intermediate_data_dir, recursive=TRUE, showWarnings=FALSE)
  123. opt$feature_track[, 1] = gsub(paste0("^", opt$chr_prefix), "chr", opt$feature_track[, 1])
  124. system(paste0("cooler dump --table chroms --out ",
  125. file.path(intermediate_data_dir, "chroms.txt"),
  126. " --header ",
  127. opt$input,
  128. "::/resolutions/",
  129. opt$bin_size
  130. ))
  131. chroms <- read.table(file.path(intermediate_data_dir, "chroms.txt"), sep="\t", header=TRUE)
  132. if(opt$chromosomes == "all"){
  133. chroms <- chroms[!( toupper(chroms$name) %in% toupper(CHROMS_TO_REMOVE) ), "name"]
  134. }
  135. else{
  136. chrom_list <- strsplit(opt$chromosomes, ",")[[1]]
  137. chroms <- chroms[chroms$name %in% chrom_list, "name"]
  138. }
  139. dump_paths <- list()
  140. for(chrom in chroms){
  141. cat(paste0("[Pre-processing] Dumping ", chrom, "\n"))
  142. chrom_dump_path <- file.path(intermediate_data_dir, paste0(chrom, "_dump.txt"))
  143. dump_paths <- c(dump_paths, chrom_dump_path)
  144. if(! file.exists(chrom_dump_path)){
  145. system(paste0("cooler dump --table pixels --range ",
  146. chrom,
  147. " --join --balanced ",
  148. opt$input,
  149. "::/resolutions/",
  150. opt$bin_size,
  151. " | cut -f2,5,8 | awk '{if ($3) print;}' > ",
  152. chrom_dump_path))
  153. }
  154. }
  155. chroms <- sanitize_chroms(chroms)
  156. names(dump_paths) <- chroms
  157. CALDER(contact_file_dump=dump_paths,
  158. chrs=chroms,
  159. bin_size=opt$bin_size,
  160. genome=opt$genome,
  161. save_dir=opt$outpath,
  162. feature_track=opt$feature_track,
  163. single_binsize_only=!opt$adaptive,
  164. save_intermediate_data=TRUE,
  165. n_cores=opt$nproc,
  166. sub_domains=T)
  167. }
  168. handle_input_cool <- function(opt){
  169. intermediate_data_dir = file.path(opt$outpath, "intermediate_data")
  170. dir.create(intermediate_data_dir, recursive=TRUE, showWarnings=FALSE)
  171. # 处理染色体前缀
  172. opt$feature_track[, 1] = gsub(paste0("^", opt$chr_prefix), "chr", opt$feature_track[, 1])
  173. system(paste0("cooler dump --table chroms --out ",
  174. file.path(intermediate_data_dir, "chroms.txt"),
  175. " --header ",
  176. opt$input
  177. ))
  178. chroms <- read.table(file.path(intermediate_data_dir, "chroms.txt"), sep="\t", header=TRUE)
  179. if(opt$chromosomes == "all"){
  180. chroms <- chroms[!( toupper(chroms$name) %in% toupper(CHROMS_TO_REMOVE) ), "name"]
  181. }
  182. else{
  183. chrom_list <- strsplit(opt$chromosomes, ",")[[1]]
  184. chroms <- chroms[chroms$name %in% chrom_list, "name"]
  185. }
  186. dump_paths <- list()
  187. for(chrom in chroms){
  188. cat(paste0("[Pre-processing] Dumping ", chrom, "\n"))
  189. chrom_dump_path <- file.path(intermediate_data_dir, paste0(chrom, "_dump.txt"))
  190. dump_paths <- c(dump_paths, chrom_dump_path)
  191. if(! file.exists(chrom_dump_path)){
  192. system(paste0("cooler dump --table pixels --range ",
  193. chrom,
  194. " --join --balanced ",
  195. opt$input,
  196. " | cut -f2,5,8 | awk '{if ($3) print;}' > ",
  197. chrom_dump_path))
  198. }
  199. }
  200. chroms <- sanitize_chroms(chroms)
  201. names(dump_paths) <- chroms
  202. CALDER(contact_file_dump=dump_paths,
  203. chrs=chroms,
  204. bin_size=opt$bin_size,
  205. genome=opt$genome,
  206. save_dir=opt$outpath,
  207. feature_track=opt$feature_track,
  208. single_binsize_only=!opt$adaptive,
  209. save_intermediate_data=TRUE,
  210. n_cores=opt$nproc,
  211. sub_domains=T)
  212. }
  213. opt <- parse_arguments()
  214. if(opt$type == "hic"){
  215. handle_input_hic(opt)
  216. } else if(opt$type == "mcool"){
  217. handle_input_mcool(opt)
  218. } else if(opt$type == "cool"){
  219. handle_input_cool(opt)
  220. } else {
  221. stop("Unknown input type")
  222. }
  223. # Cleaning the output
  224. intermediate_data_dir = file.path(opt$outpath, "intermediate_data")
  225. if(dir.exists(intermediate_data_dir) && (!opt$keep_intermediate)){
  226. cat('[Post-processing] Removing intermediate data\n')
  227. unlink(intermediate_data_dir, recursive=TRUE)
  228. }
  229. exec_time_file = "./total_execution.time"
  230. if(file.exists(exec_time_file)){
  231. cat("[Post-processing] Removing total_execution.time\n")
  232. file.remove(exec_time_file)
  233. }
  234. # 将 chr 再重新替换为原来的前缀
  235. replace_chr_in_files <- function(dir_path, old_prefix, new_prefix) {
  236. # 定义常见的文本文件扩展名
  237. text_file_extensions <- c("txt", "tsv", "csv", "bed")
  238. # 获取目录中的所有文件和子目录
  239. files <- list.files(dir_path, recursive = TRUE, full.names = TRUE)
  240. for (file in files) {
  241. # 获取文件扩展名
  242. file_ext <- tools::file_ext(file)
  243. # 检查文件是否为文本文件
  244. if (file.info(file)$isdir == FALSE && file_ext %in% text_file_extensions) {
  245. tryCatch({
  246. # 读取文件内容
  247. content <- readLines(file)
  248. # 替换行首的 'chr' 为 'Chr'
  249. content <- gsub(paste0("^", old_prefix), new_prefix, content)
  250. # 写回文件
  251. writeLines(content, file)
  252. cat("Processed:", file, "\n")
  253. }, error = function(e) {
  254. cat("Failed to process:", file, "\n", e, "\n")
  255. })
  256. }
  257. }
  258. }
  259. # 示例用法
  260. replace_chr_in_files(opt$outpath, "^chr", opt$chr_prefix)