calder 5.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164
  1. #!/usr/bin/env Rscript
  2. suppressPackageStartupMessages(library(optparse))
  3. suppressPackageStartupMessages(library(CALDER))
  4. AVAILABLE_REFERENCE_TRACKS_GENOMES <- c("hg19", "hg38", "mm9", "mm10")
  5. INPUT_TYPES <- c("hic", "cool")
  6. HIC_CHROMS_TO_REMOVE <- c("ALL", "M", "MT", "Y")
  7. COOL_CHROMS_TO_REMOVE <- c("MT", "M", 'chrMT', 'chrM', 'Y', 'chrY')
  8. parse_arguments <- function(){
  9. # Creating the argument parsing options
  10. option_list = list(
  11. make_option(c("-i", "--input"), action="store", default=NA, type='character',
  12. help="Input Hi-C contacts"),
  13. make_option(c("-t", "--type"), action="store", default='hic', type='character',
  14. help="The type of input: hic or cool [default %default]"),
  15. make_option(c("-b", "--bin_size"), action="store", default=50000, type='integer',
  16. help="Bin size to use for the analysis [default %default]"),
  17. make_option(c("-g", "--genome"), action="store", default="hg19", type='character',
  18. help="Genome assembly to use [default %default]"),
  19. make_option(c("-f", "--feature_track"), action="store", default=NA, type='character',
  20. help="Genomic feature track to be used to determine A/B compartment direction
  21. when genome == 'others'. The track should presumably have higher values
  22. in A than in B compartmnets. [default %default]"),
  23. make_option(c("-p", "--nproc"), action="store", default=1, type='integer',
  24. help="Number of cores to use [default %default]"),
  25. make_option(c("-o", "--outpath"), action="store", default=NA, type='character',
  26. help="Path to the output folder"),
  27. make_option(c("-k", "--keep_intermediate"), action="store_true", default=FALSE, type='logical',
  28. help="Keep intermediate data after done [default %default]")
  29. )
  30. parser <- OptionParser(usage = "%prog [options]", option_list=option_list)
  31. opt <- parse_args(parser)
  32. # Checking if input path exists
  33. if(is.na(opt$input)){
  34. print_help(parser)
  35. stop(paste0("Input path (", opt$input,") does not exist"))
  36. }
  37. # Checking if output path is provided
  38. if(is.na(opt$outpath)){
  39. stop("Output path was not provided")
  40. }
  41. # Check that the input type is one of the possible ones
  42. if(!(opt$type %in% INPUT_TYPES)){
  43. stop(paste0("Input type ", opt$input_type, " not available"))
  44. }
  45. # Check if the provided genome is in the list of available reference genomes
  46. # or if a feature track is provided
  47. if((!(opt$genome %in% AVAILABLE_REFERENCE_TRACKS_GENOMES)) || (file.exists(opt$feature_track))){
  48. # in this case, we just assign it the name 'others'
  49. opt$genome = "others"
  50. }
  51. writeLines(c(
  52. "*******************************",
  53. "* CALDER *",
  54. "*******************************",
  55. paste0("[Parameters] Input: ", opt$input),
  56. paste0("[Parameters] Input type: ", opt$type),
  57. paste0("[Parameters] Bin size: ", opt$bin_size),
  58. paste0("[Parameters] Genome: ", opt$genome),
  59. paste0("[Parameters] N. cores: ", opt$nproc),
  60. paste0("[Parameters] Output: ", opt$outpath)
  61. ))
  62. return(opt)
  63. }
  64. sanitize_chroms <- function(chroms){
  65. res <- lapply(chroms, function(x){
  66. if(startsWith(x, "chr")){
  67. return(substring(x, 4))
  68. } else{
  69. return(x)
  70. }
  71. })
  72. return(res)
  73. }
  74. handle_input_hic <- function(opt){
  75. suppressPackageStartupMessages(library(strawr))
  76. chromsizes <- readHicChroms(opt$input)
  77. chroms <- chromsizes[!(chromsizes$name %in% HIC_CHROMS_TO_REMOVE), "name"]
  78. CALDER(contact_file_hic = opt$input,
  79. chrs = chroms,
  80. bin_size = opt$bin_size,
  81. genome = opt$genome,
  82. save_dir=opt$outpath,
  83. save_intermediate_data=TRUE,
  84. single_binsize_only=TRUE,
  85. n_cores = opt$nproc,
  86. sub_domains=TRUE)
  87. }
  88. handle_input_cool <- function(opt){
  89. intermediate_data_dir = file.path(opt$outpath, "intermediate_data")
  90. dir.create(intermediate_data_dir, recursive=TRUE, showWarnings=FALSE)
  91. system(paste0("cooler dump --table chroms --out ",
  92. file.path(intermediate_data_dir, "chroms.txt"),
  93. " --header ",
  94. opt$input))
  95. chroms <- read.table(file.path(intermediate_data_dir, "chroms.txt"), sep="\t", header=TRUE)
  96. chroms <- chroms[!(chroms$name %in% COOL_CHROMS_TO_REMOVE), "name"]
  97. dump_paths <- list()
  98. for(chrom in chroms){
  99. cat(paste0("[Pre-processing] Dumping ", chrom, "\n"))
  100. chrom_dump_path <- file.path(intermediate_data_dir, paste0(chrom, "_dump.txt"))
  101. dump_paths <- c(dump_paths, chrom_dump_path)
  102. if(! file.exists(chrom_dump_path)){
  103. system(paste0("cooler dump --table pixels --range ",
  104. chrom,
  105. " --join --balanced ",
  106. opt$input,
  107. " | cut -f2,5,8 | awk '{if ($3) print;}' > ",
  108. chrom_dump_path))
  109. }
  110. }
  111. chroms <- sanitize_chroms(chroms)
  112. names(dump_paths) <- chroms
  113. CALDER(contact_file_dump=dump_paths,
  114. chrs=chroms,
  115. bin_size=opt$bin_size,
  116. genome=opt$genome,
  117. save_dir=opt$outpath,,
  118. single_binsize_only=TRUE,
  119. save_intermediate_data=TRUE,
  120. n_cores=opt$nproc,
  121. sub_domains=TRUE)
  122. }
  123. opt <- parse_arguments()
  124. if(opt$type == "hic"){
  125. handle_input_hic(opt)
  126. } else if(opt$type == "cool"){
  127. handle_input_cool(opt)
  128. } else {
  129. stop("Unknown input type")
  130. }
  131. # Cleaning the output
  132. intermediate_data_dir = file.path(opt$outpath, "intermediate_data")
  133. if(dir.exists(intermediate_data_dir) && (!opt$keep_intermediate)){
  134. cat('[Post-processing] Removing intermediate data\n')
  135. unlink(intermediate_data_dir, recursive=TRUE)
  136. }
  137. exec_time_file = "./total_execution.time"
  138. if(file.exists(exec_time_file)){
  139. cat("[Post-processing] Removing total_execution.time\n")
  140. file.remove(exec_time_file)
  141. }