123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869 |
- import argparse
- import cooler
- import logging
- import pandas as pd
- from hicrep import hicrepSCC
- from itertools import combinations
- import time
- from concurrent.futures import ProcessPoolExecutor
- # 设置日志配置
- logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
- def calculate_pair_scc(pair, h, dBPMax, bDownSample):
- (file1, name1), (file2, name2) = pair # 解包每个元组
- logging.info(f"Starting SCC calculation between {name1} and {name2}")
- cool1 = cooler.Cooler(file1)
- cool2 = cooler.Cooler(file2)
- binSize1 = cool1.info['bin-size']
- binSize2 = cool2.info['bin-size']
- chromosomes = cool1.chromnames
- if binSize1 != binSize2:
- logging.error("Bin sizes of the two cool files must be equal.")
- raise ValueError("Bin sizes of the two cool files must be equal.")
- start_time = time.time()
- scc_scores = hicrepSCC(cool1, cool2, h, dBPMax, bDownSample)
- elapsed_time = time.time() - start_time
- logging.info(f"Finished SCC calculation between {name1} and {name2} in {elapsed_time:.2f} seconds")
- results = [{'SampleA': name1, 'SampleB': name2, 'Chromosome': chrom, 'SCC': scc}
- for chrom, scc in zip(chromosomes, scc_scores)]
- return results
- def compute_scc(files, sample_names, h, dBPMax, bDownSample, num_workers=4):
- pairs = list(combinations(zip(files, sample_names), 2))
- results = []
- with ProcessPoolExecutor(max_workers=num_workers) as executor:
- futures = [executor.submit(calculate_pair_scc, pair, h, dBPMax, bDownSample) for pair in pairs]
- for future in futures:
- results.extend(future.result())
- return results
- def main():
- parser = argparse.ArgumentParser(description="Compute SCC for pairs of .cool files.")
- parser.add_argument('--files', nargs='+', required=True, help="List of .cool files to process.")
- parser.add_argument('--sample_names', nargs='+', required=True, help="List of sample names corresponding to the .cool files.")
- parser.add_argument('--output', type=str, required=True, help="Output file path to store the results.")
- parser.add_argument('--h', type=int, default=1, help="Smoothing window radius.")
- parser.add_argument('--dBPMax', type=int, default=500000, help="Maximum genomic distance.")
- parser.add_argument('--bDownSample', action='store_true', help="Whether to perform downsampling.")
- parser.add_argument('--workers', type=int, default=4, help="Number of workers for parallel computation.")
- args = parser.parse_args()
- if len(args.files) != len(args.sample_names):
- logging.error("The number of files must match the number of sample names provided.")
- return
- logging.info("Starting SCC computations for all combinations of provided .cool files.")
- scc_results = compute_scc(args.files, args.sample_names, args.h, args.dBPMax, args.bDownSample, args.workers)
-
- df = pd.DataFrame(scc_results)
- df.to_csv(args.output, index=False)
- logging.info(f"Results have been saved to {args.output}")
- if __name__ == "__main__":
- main()
|