calder 5.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172
  1. #!/usr/bin/env Rscript
  2. suppressPackageStartupMessages(library(optparse))
  3. suppressPackageStartupMessages(library(CALDER))
  4. AVAILABLE_REFERENCE_TRACKS_GENOMES <- c("hg19", "hg38", "mm9", "mm10")
  5. INPUT_TYPES <- c("hic", "cool")
  6. CHROMS_TO_REMOVE <- c("ALL", "M", "chrM", "MT", "chrMT", "Y", "chrY")
  7. parse_arguments <- function(){
  8. # Creating the argument parsing options
  9. option_list = list(
  10. make_option(c("-i", "--input"), action="store", default=NA, type='character',
  11. help="Input Hi-C contacts"),
  12. make_option(c("-t", "--type"), action="store", default='hic', type='character',
  13. help="The type of input: hic or cool [default %default]"),
  14. make_option(c("-b", "--bin_size"), action="store", default=50000, type='integer',
  15. help="Bin size to use for the analysis [default %default]"),
  16. make_option(c("-g", "--genome"), action="store", default="hg19", type='character',
  17. help="Genome assembly to use [default %default]"),
  18. make_option(c("-f", "--feature_track"), action="store", default=NA, type='character',
  19. help="Genomic feature track to be used to determine A/B compartment direction
  20. when genome == 'others'. The track should presumably have higher values
  21. in A than in B compartmnets. [default %default]"),
  22. make_option(c("-c", "--chromosomes"), action='store', default='all', type='character',
  23. help="Chromosomes to analyze, separated by comma. [default %default]"),
  24. make_option(c("-p", "--nproc"), action="store", default=1, type='integer',
  25. help="Number of cores to use [default %default]"),
  26. make_option(c("-o", "--outpath"), action="store", default=NA, type='character',
  27. help="Path to the output folder"),
  28. make_option(c("-k", "--keep_intermediate"), action="store_true", default=FALSE, type='logical',
  29. help="Keep intermediate data after done [default %default]")
  30. )
  31. parser <- OptionParser(usage = "%prog [options]", option_list=option_list)
  32. opt <- parse_args(parser)
  33. # Checking if input path exists
  34. if(is.na(opt$input)){
  35. print_help(parser)
  36. stop(paste0("Input path (", opt$input,") does not exist"))
  37. }
  38. # Checking if output path is provided
  39. if(is.na(opt$outpath)){
  40. stop("Output path was not provided")
  41. }
  42. # Check that the input type is one of the possible ones
  43. if(!(opt$type %in% INPUT_TYPES)){
  44. stop(paste0("Input type ", opt$input_type, " not available"))
  45. }
  46. # Check if the provided genome is in the list of available reference genomes
  47. # or if a feature track is provided
  48. if((!(opt$genome %in% AVAILABLE_REFERENCE_TRACKS_GENOMES)) || (file.exists(opt$feature_track))){
  49. # in this case, we just assign it the name 'others'
  50. opt$genome = "others"
  51. }
  52. writeLines(c(
  53. "*******************************",
  54. "* CALDER *",
  55. "*******************************",
  56. paste0("[Parameters] Input: ", opt$input),
  57. paste0("[Parameters] Input type: ", opt$type),
  58. paste0("[Parameters] Bin size: ", opt$bin_size),
  59. paste0("[Parameters] Genome: ", opt$genome),
  60. paste0("[Parameters] Chromosomes: ", opt$chromosomes),
  61. paste0("[Parameters] N. cores: ", opt$nproc),
  62. paste0("[Parameters] Output: ", opt$outpath)
  63. ))
  64. return(opt)
  65. }
  66. sanitize_chroms <- function(chroms){
  67. res <- lapply(chroms, function(x){
  68. if(startsWith(x, "chr")){
  69. return(substring(x, 4))
  70. } else{
  71. return(x)
  72. }
  73. })
  74. return(res)
  75. }
  76. handle_input_hic <- function(opt){
  77. suppressPackageStartupMessages(library(strawr))
  78. chromsizes <- readHicChroms(opt$input)
  79. chroms <- chromsizes[!(toupper(chromsizes$name) %in% toupper(CHROMS_TO_REMOVE)), "name"]
  80. CALDER(contact_file_hic = opt$input,
  81. chrs = chroms,
  82. bin_size = opt$bin_size,
  83. genome = opt$genome,
  84. save_dir=opt$outpath,
  85. save_intermediate_data=TRUE,
  86. single_binsize_only=TRUE,
  87. n_cores = opt$nproc,
  88. sub_domains=TRUE)
  89. }
  90. handle_input_cool <- function(opt){
  91. intermediate_data_dir = file.path(opt$outpath, "intermediate_data")
  92. dir.create(intermediate_data_dir, recursive=TRUE, showWarnings=FALSE)
  93. system(paste0("cooler dump --table chroms --out ",
  94. file.path(intermediate_data_dir, "chroms.txt"),
  95. " --header ",
  96. opt$input))
  97. chroms <- read.table(file.path(intermediate_data_dir, "chroms.txt"), sep="\t", header=TRUE)
  98. if(opt$chromosomes == "all"){
  99. chroms <- chroms[!( toupper(chroms$name) %in% toupper(CHROMS_TO_REMOVE) ), "name"]
  100. }
  101. else{
  102. chrom_list <- strsplit(opt$chromosomes, ",")[[1]]
  103. chroms <- chroms[chroms$name %in% chrom_list, "name"]
  104. }
  105. dump_paths <- list()
  106. for(chrom in chroms){
  107. cat(paste0("[Pre-processing] Dumping ", chrom, "\n"))
  108. chrom_dump_path <- file.path(intermediate_data_dir, paste0(chrom, "_dump.txt"))
  109. dump_paths <- c(dump_paths, chrom_dump_path)
  110. if(! file.exists(chrom_dump_path)){
  111. system(paste0("cooler dump --table pixels --range ",
  112. chrom,
  113. " --join --balanced ",
  114. opt$input,
  115. " | cut -f2,5,8 | awk '{if ($3) print;}' > ",
  116. chrom_dump_path))
  117. }
  118. }
  119. chroms <- sanitize_chroms(chroms)
  120. names(dump_paths) <- chroms
  121. CALDER(contact_file_dump=dump_paths,
  122. chrs=chroms,
  123. bin_size=opt$bin_size,
  124. genome=opt$genome,
  125. save_dir=opt$outpath,,
  126. single_binsize_only=TRUE,
  127. save_intermediate_data=TRUE,
  128. n_cores=opt$nproc,
  129. sub_domains=TRUE)
  130. }
  131. opt <- parse_arguments()
  132. if(opt$type == "hic"){
  133. handle_input_hic(opt)
  134. } else if(opt$type == "cool"){
  135. handle_input_cool(opt)
  136. } else {
  137. stop("Unknown input type")
  138. }
  139. # Cleaning the output
  140. intermediate_data_dir = file.path(opt$outpath, "intermediate_data")
  141. if(dir.exists(intermediate_data_dir) && (!opt$keep_intermediate)){
  142. cat('[Post-processing] Removing intermediate data\n')
  143. unlink(intermediate_data_dir, recursive=TRUE)
  144. }
  145. exec_time_file = "./total_execution.time"
  146. if(file.exists(exec_time_file)){
  147. cat("[Post-processing] Removing total_execution.time\n")
  148. file.remove(exec_time_file)
  149. }