Jelajahi Sumber

command line utility

lucananni93 2 tahun lalu
induk
melakukan
29cad82ee3
1 mengubah file dengan 164 tambahan dan 0 penghapusan
  1. 164 0
      scripts/calder

+ 164 - 0
scripts/calder

@@ -0,0 +1,164 @@
+#!/usr/bin/env Rscript
+
+suppressPackageStartupMessages(library(optparse))
+suppressPackageStartupMessages(library(CALDER))
+
+AVAILABLE_REFERENCE_TRACKS_GENOMES <- c("hg19", "hg38", "mm9", "mm10")
+INPUT_TYPES <- c("hic", "cool")
+HIC_CHROMS_TO_REMOVE <- c("ALL", "M", "MT", "Y")
+COOL_CHROMS_TO_REMOVE <- c("MT", "M", 'chrMT', 'chrM', 'Y', 'chrY')
+
+
+parse_arguments <- function(){
+	# Creating the argument parsing options
+	option_list = list(
+	  make_option(c("-i", "--input"), action="store", default=NA, type='character',
+	              help="Input Hi-C contacts"),
+	  make_option(c("-t", "--type"), action="store", default='hic', type='character',
+	              help="The type of input: hic or cool [default %default]"),
+	  make_option(c("-b", "--bin_size"), action="store", default=50000, type='integer',
+	              help="Bin size to use for the analysis [default %default]"),
+	  make_option(c("-g", "--genome"), action="store", default="hg19", type='character',
+	              help="Genome assembly to use [default %default]"),
+	  make_option(c("-f", "--feature_track"), action="store", default=NA, type='character',
+	              help="Genomic feature track to be used to determine A/B compartment direction
+	              when genome == 'others'. The track should presumably have higher values
+	              in A than in B compartmnets. [default %default]"),
+	  make_option(c("-p", "--nproc"), action="store", default=1, type='integer',
+	              help="Number of cores to use [default %default]"),
+	  make_option(c("-o", "--outpath"), action="store", default=NA, type='character',
+	              help="Path to the output folder"),
+	  make_option(c("-k", "--keep_intermediate"), action="store_true", default=FALSE, type='logical',
+	              help="Keep intermediate data after done [default %default]")
+	)
+	parser <- OptionParser(usage = "%prog [options]", option_list=option_list)
+	opt <- parse_args(parser)
+
+	# Checking if input path exists
+	if(is.na(opt$input)){
+		print_help(parser)
+		stop(paste0("Input path (", opt$input,") does not exist"))
+	}
+
+	# Checking if output path is provided
+	if(is.na(opt$outpath)){
+		stop("Output path was not provided")
+	}
+
+	# Check that the input type is one of the possible ones
+	if(!(opt$type %in% INPUT_TYPES)){
+		stop(paste0("Input type ", opt$input_type, " not available"))
+	}
+
+	# Check if the provided genome is in the list of available reference genomes 
+	# or if a feature track is provided
+	if((!(opt$genome %in% AVAILABLE_REFERENCE_TRACKS_GENOMES)) || (file.exists(opt$feature_track))){
+		# in this case, we just assign it the name 'others'
+		opt$genome = "others"
+	}
+
+	writeLines(c(
+		"*******************************",
+		"*            CALDER           *",
+		"*******************************",
+		paste0("[Parameters] Input: ", opt$input),
+		paste0("[Parameters] Input type: ", opt$type),
+		paste0("[Parameters] Bin size: ", opt$bin_size),
+		paste0("[Parameters] Genome: ", opt$genome),
+		paste0("[Parameters] N. cores: ", opt$nproc),
+		paste0("[Parameters] Output: ", opt$outpath)
+	))
+
+	return(opt)
+}
+
+sanitize_chroms <- function(chroms){
+	res <- lapply(chroms, function(x){
+		if(startsWith(x, "chr")){
+			return(substring(x, 4))
+		} else{
+			return(x)
+		}
+	})
+	return(res)
+}
+
+handle_input_hic <- function(opt){
+	suppressPackageStartupMessages(library(strawr))
+	chromsizes <- readHicChroms(opt$input)
+	chroms <- chromsizes[!(chromsizes$name %in% HIC_CHROMS_TO_REMOVE), "name"]
+	CALDER(contact_file_hic = opt$input,
+		   chrs = chroms,
+		   bin_size = opt$bin_size,
+		   genome = opt$genome,
+		   save_dir=opt$outpath,
+		   save_intermediate_data=TRUE,
+		   single_binsize_only=TRUE,
+		   n_cores = opt$nproc,
+		   sub_domains=TRUE)
+}
+
+handle_input_cool <- function(opt){
+	intermediate_data_dir = file.path(opt$outpath, "intermediate_data")
+	dir.create(intermediate_data_dir, recursive=TRUE, showWarnings=FALSE)
+
+	system(paste0("cooler dump --table chroms --out ", 
+				  file.path(intermediate_data_dir, "chroms.txt"), 
+				  " --header ", 
+				  opt$input))
+	chroms <- read.table(file.path(intermediate_data_dir, "chroms.txt"), sep="\t", header=TRUE)
+	chroms <- chroms[!(chroms$name %in% COOL_CHROMS_TO_REMOVE), "name"]
+
+	dump_paths <- list()
+	for(chrom in chroms){
+		cat(paste0("[Pre-processing] Dumping ", chrom, "\n"))
+		chrom_dump_path <- file.path(intermediate_data_dir, paste0(chrom, "_dump.txt"))
+		dump_paths <- c(dump_paths, chrom_dump_path)
+		if(! file.exists(chrom_dump_path)){
+			system(paste0("cooler dump --table pixels --range ", 
+						  chrom, 
+						  " --join --balanced ",
+						  opt$input,
+						  " | cut -f2,5,8 | awk '{if ($3) print;}' > ",
+						  chrom_dump_path))
+		}
+	}
+
+	chroms <- sanitize_chroms(chroms)
+	names(dump_paths) <- chroms
+
+	CALDER(contact_file_dump=dump_paths, 
+		   chrs=chroms, 
+		   bin_size=opt$bin_size,
+		   genome=opt$genome,
+		   save_dir=opt$outpath,,
+		   single_binsize_only=TRUE,
+		   save_intermediate_data=TRUE,
+		   n_cores=opt$nproc,
+		   sub_domains=TRUE)
+
+}
+
+opt <- parse_arguments()
+
+
+if(opt$type == "hic"){
+	handle_input_hic(opt)
+} else if(opt$type == "cool"){
+	handle_input_cool(opt)
+} else {
+	stop("Unknown input type")
+}
+
+# Cleaning the output
+intermediate_data_dir = file.path(opt$outpath, "intermediate_data")
+if(dir.exists(intermediate_data_dir) && (!opt$keep_intermediate)){
+	cat('[Post-processing] Removing intermediate data\n')
+	unlink(intermediate_data_dir, recursive=TRUE)
+}
+exec_time_file = "./total_execution.time"
+if(file.exists(exec_time_file)){
+	cat("[Post-processing] Removing total_execution.time\n")
+	file.remove(exec_time_file)
+}
+