import argparse import cooler import logging import pandas as pd from hicrep import hicrepSCC from itertools import combinations import time from concurrent.futures import ProcessPoolExecutor # 设置日志配置 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') def calculate_pair_scc(pair, h, dBPMax, bDownSample): (file1, name1), (file2, name2) = pair # 解包每个元组 logging.info(f"Starting SCC calculation between {name1} and {name2}") cool1 = cooler.Cooler(file1) cool2 = cooler.Cooler(file2) binSize1 = cool1.info['bin-size'] binSize2 = cool2.info['bin-size'] chromosomes = cool1.chromnames if binSize1 != binSize2: logging.error("Bin sizes of the two cool files must be equal.") raise ValueError("Bin sizes of the two cool files must be equal.") start_time = time.time() scc_scores = hicrepSCC(cool1, cool2, h, dBPMax, bDownSample) elapsed_time = time.time() - start_time logging.info(f"Finished SCC calculation between {name1} and {name2} in {elapsed_time:.2f} seconds") results = [{'SampleA': name1, 'SampleB': name2, 'Chromosome': chrom, 'SCC': scc} for chrom, scc in zip(chromosomes, scc_scores)] return results def compute_scc(files, sample_names, h, dBPMax, bDownSample, num_workers=4): pairs = list(combinations(zip(files, sample_names), 2)) results = [] with ProcessPoolExecutor(max_workers=num_workers) as executor: futures = [executor.submit(calculate_pair_scc, pair, h, dBPMax, bDownSample) for pair in pairs] for future in futures: results.extend(future.result()) return results def main(): parser = argparse.ArgumentParser(description="Compute SCC for pairs of .cool files.") parser.add_argument('--files', nargs='+', required=True, help="List of .cool files to process.") parser.add_argument('--sample_names', nargs='+', required=True, help="List of sample names corresponding to the .cool files.") parser.add_argument('--output', type=str, required=True, help="Output file path to store the results.") parser.add_argument('--h', type=int, default=1, help="Smoothing window radius.") parser.add_argument('--dBPMax', type=int, default=500000, help="Maximum genomic distance.") parser.add_argument('--bDownSample', action='store_true', help="Whether to perform downsampling.") parser.add_argument('--workers', type=int, default=4, help="Number of workers for parallel computation.") args = parser.parse_args() if len(args.files) != len(args.sample_names): logging.error("The number of files must match the number of sample names provided.") return logging.info("Starting SCC computations for all combinations of provided .cool files.") scc_results = compute_scc(args.files, args.sample_names, args.h, args.dBPMax, args.bDownSample, args.workers) df = pd.DataFrame(scc_results) df.to_csv(args.output, index=False) logging.info(f"Results have been saved to {args.output}") if __name__ == "__main__": main()