#!/usr/bin/env Rscript ########################################################################### # 该程序修改自 https://github.com/CSOgroup/CALDER2/blob/main/scripts/calder # 1. 源程序无法正确处理染色体名称不以 chr 为前缀的情况 # 2. 只修改了 cool 格式作为输入文件,hic 格式预计仍然有 bug 未测试 # 3. 如果染色体前缀不一致,无法处理 # 4. 增加了对 mcool 格式的兼容 # 修改人:张旭东 zhangxudong@genek.cn ########################################################################### suppressPackageStartupMessages(library(optparse)) suppressPackageStartupMessages(library(CALDER)) AVAILABLE_REFERENCE_TRACKS_GENOMES <- c("hg19", "hg38", "mm9", "mm10") INPUT_TYPES <- c("hic", "cool", "mcool") CHROMS_TO_REMOVE <- c("ALL", "M", "chrM", "MT", "chrMT", "Y", "chrY") parse_arguments <- function(){ # Creating the argument parsing options option_list = list( make_option(c("-i", "--input"), action="store", default=NA, type='character', help="Input Hi-C contacts"), # 增加对 mcool 格式的兼容 make_option(c("-t", "--type"), action="store", default='hic', type='character', help="The type of input: hic or cool or mcool [default %default]"), make_option(c("-b", "--bin_size"), action="store", default=50000, type='integer', help="Bin size to use for the analysis [default %default]"), make_option(c("-g", "--genome"), action="store", default="hg19", type='character', help="Genome assembly to use [default %default]"), make_option(c("-f", "--feature_track"), action="store", default=NA, type='character', help="Genomic feature track to be used to determine A/B compartment direction when genome == 'others'. The track should presumably have higher values in A than in B compartmnets. [default %default]"), # 增加了原染色体前缀参数 make_option(c("--chr_prefix"), action="store", default="chr", type='character', help="old chr prefix [default %default]"), make_option(c("-c", "--chromosomes"), action='store', default='all', type='character', help="Chromosomes to analyze, separated by comma. [default %default]"), make_option(c("-p", "--nproc"), action="store", default=1, type='integer', help="Number of cores to use [default %default]"), make_option(c("-o", "--outpath"), action="store", default=NA, type='character', help="Path to the output folder"), make_option(c("-k", "--keep_intermediate"), action="store_true", default=FALSE, type='logical', help="Keep intermediate data after done [default %default]"), make_option(c("-a", "--adaptive"), action="store_true", default=FALSE, type='logical', help="Use adaptive resolution choice [default %default]") ) parser <- OptionParser(usage = "%prog [options]", option_list=option_list) opt <- parse_args(parser) # Checking if input path exists if(is.na(opt$input)){ print_help(parser) stop(paste0("Input path (", opt$input,") does not exist")) } # Checking if output path is provided if(is.na(opt$outpath)){ stop("Output path was not provided") } # Check that the input type is one of the possible ones if(!(opt$type %in% INPUT_TYPES)){ stop(paste0("Input type ", opt$input_type, " not available")) } # Check if the provided genome is in the list of available reference genomes # or if a feature track is provided if((!(opt$genome %in% AVAILABLE_REFERENCE_TRACKS_GENOMES)) || (file.exists(opt$feature_track))){ # in this case, we just assign it the name 'others' opt$genome = "others" } writeLines(c( "*******************************", "* CALDER *", "*******************************", paste0("[Parameters] Input: ", opt$input), paste0("[Parameters] Input type: ", opt$type), paste0("[Parameters] Bin size: ", opt$bin_size), paste0("[Parameters] Genome: ", opt$genome), paste0("[Parameters] Feature Track: ", opt$feature_track), paste0("[Parameters] Chr Prefix: ", opt$chr_prefix), paste0("[Parameters] Chromosomes: ", opt$chromosomes), paste0("[Parameters] N. cores: ", opt$nproc), paste0("[Parameters] Output: ", opt$outpath), paste0("[Parameters] Keep Intermediate data: ", opt$keep_intermediate), paste0("[Parameters] Use adaptive resolution: ", opt$adaptive) )) if(file.exists(opt$feature_track)){ opt$feature_track <- read.table(opt$feature_track, header = F) } return(opt) } sanitize_chroms <- function(chroms){ res <- lapply(chroms, function(x){ if(startsWith(x, opt$chr_prefix)){ return(substring(x, 4)) } else{ return(x) } }) return(res) } handle_input_hic <- function(opt){ suppressPackageStartupMessages(library(strawr)) chromsizes <- readHicChroms(opt$input) if(opt$chromosomes == "all"){ chroms <- chromsizes[!(toupper(chromsizes$name) %in% toupper(CHROMS_TO_REMOVE)), "name"] } else{ chrom_list <- strsplit(opt$chromosomes, ",")[[1]] chroms <- chromsizes[chromsizes$name %in% chrom_list, "name"] } chroms <- sanitize_chroms(chroms) CALDER(contact_file_hic = opt$input, chrs = chroms, bin_size = opt$bin_size, genome = opt$genome, save_dir=opt$outpath, save_intermediate_data=TRUE, feature_track=opt$feature_track, single_binsize_only=!opt$adaptive, n_cores = opt$nproc, sub_domains=T) } # 增加了对 mcool 的支持 handle_input_mcool <- function(opt){ intermediate_data_dir = file.path(opt$outpath, "intermediate_data") dir.create(intermediate_data_dir, recursive=TRUE, showWarnings=FALSE) opt$feature_track[, 1] = gsub(paste0("^", opt$chr_prefix), "chr", opt$feature_track[, 1]) system(paste0("cooler dump --table chroms --out ", file.path(intermediate_data_dir, "chroms.txt"), " --header ", opt$input, "::/resolutions/", opt$bin_size )) chroms <- read.table(file.path(intermediate_data_dir, "chroms.txt"), sep="\t", header=TRUE) if(opt$chromosomes == "all"){ chroms <- chroms[!( toupper(chroms$name) %in% toupper(CHROMS_TO_REMOVE) ), "name"] } else{ chrom_list <- strsplit(opt$chromosomes, ",")[[1]] chroms <- chroms[chroms$name %in% chrom_list, "name"] } dump_paths <- list() for(chrom in chroms){ cat(paste0("[Pre-processing] Dumping ", chrom, "\n")) chrom_dump_path <- file.path(intermediate_data_dir, paste0(chrom, "_dump.txt")) dump_paths <- c(dump_paths, chrom_dump_path) if(! file.exists(chrom_dump_path)){ system(paste0("cooler dump --table pixels --range ", chrom, " --join --balanced ", opt$input, "::/resolutions/", opt$bin_size, " | cut -f2,5,8 | awk '{if ($3) print;}' > ", chrom_dump_path)) } } chroms <- sanitize_chroms(chroms) names(dump_paths) <- chroms CALDER(contact_file_dump=dump_paths, chrs=chroms, bin_size=opt$bin_size, genome=opt$genome, save_dir=opt$outpath, feature_track=opt$feature_track, single_binsize_only=!opt$adaptive, save_intermediate_data=TRUE, n_cores=opt$nproc, sub_domains=T) } handle_input_cool <- function(opt){ intermediate_data_dir = file.path(opt$outpath, "intermediate_data") dir.create(intermediate_data_dir, recursive=TRUE, showWarnings=FALSE) # 处理染色体前缀 opt$feature_track[, 1] = gsub(paste0("^", opt$chr_prefix), "chr", opt$feature_track[, 1]) system(paste0("cooler dump --table chroms --out ", file.path(intermediate_data_dir, "chroms.txt"), " --header ", opt$input )) chroms <- read.table(file.path(intermediate_data_dir, "chroms.txt"), sep="\t", header=TRUE) if(opt$chromosomes == "all"){ chroms <- chroms[!( toupper(chroms$name) %in% toupper(CHROMS_TO_REMOVE) ), "name"] } else{ chrom_list <- strsplit(opt$chromosomes, ",")[[1]] chroms <- chroms[chroms$name %in% chrom_list, "name"] } dump_paths <- list() for(chrom in chroms){ cat(paste0("[Pre-processing] Dumping ", chrom, "\n")) chrom_dump_path <- file.path(intermediate_data_dir, paste0(chrom, "_dump.txt")) dump_paths <- c(dump_paths, chrom_dump_path) if(! file.exists(chrom_dump_path)){ system(paste0("cooler dump --table pixels --range ", chrom, " --join --balanced ", opt$input, " | cut -f2,5,8 | awk '{if ($3) print;}' > ", chrom_dump_path)) } } chroms <- sanitize_chroms(chroms) names(dump_paths) <- chroms CALDER(contact_file_dump=dump_paths, chrs=chroms, bin_size=opt$bin_size, genome=opt$genome, save_dir=opt$outpath, feature_track=opt$feature_track, single_binsize_only=!opt$adaptive, save_intermediate_data=TRUE, n_cores=opt$nproc, sub_domains=T) } opt <- parse_arguments() if(opt$type == "hic"){ handle_input_hic(opt) } else if(opt$type == "mcool"){ handle_input_mcool(opt) } else if(opt$type == "cool"){ handle_input_cool(opt) } else { stop("Unknown input type") } # Cleaning the output intermediate_data_dir = file.path(opt$outpath, "intermediate_data") if(dir.exists(intermediate_data_dir) && (!opt$keep_intermediate)){ cat('[Post-processing] Removing intermediate data\n') unlink(intermediate_data_dir, recursive=TRUE) } exec_time_file = "./total_execution.time" if(file.exists(exec_time_file)){ cat("[Post-processing] Removing total_execution.time\n") file.remove(exec_time_file) } # 将 chr 再重新替换为原来的前缀 replace_chr_in_files <- function(dir_path, old_prefix, new_prefix) { # 定义常见的文本文件扩展名 text_file_extensions <- c("txt", "tsv", "csv", "bed") # 获取目录中的所有文件和子目录 files <- list.files(dir_path, recursive = TRUE, full.names = TRUE) for (file in files) { # 获取文件扩展名 file_ext <- tools::file_ext(file) # 检查文件是否为文本文件 if (file.info(file)$isdir == FALSE && file_ext %in% text_file_extensions) { tryCatch({ # 读取文件内容 content <- readLines(file) # 替换行首的 'chr' 为 'Chr' content <- gsub(paste0("^", old_prefix), new_prefix, content) # 写回文件 writeLines(content, file) cat("Processed:", file, "\n") }, error = function(e) { cat("Failed to process:", file, "\n", e, "\n") }) } } } # 示例用法 replace_chr_in_files(opt$outpath, "^chr", opt$chr_prefix)