zhxd2
/
HicSnake


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869
							import argparse
import cooler
import logging
import pandas as pd
from hicrep import hicrepSCC
from itertools import combinations
import time
from concurrent.futures import ProcessPoolExecutor

# 设置日志配置
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def calculate_pair_scc(pair, h, dBPMax, bDownSample):
    (file1, name1), (file2, name2) = pair  # 解包每个元组
    logging.info(f"Starting SCC calculation between {name1} and {name2}")

    cool1 = cooler.Cooler(file1)
    cool2 = cooler.Cooler(file2)
    binSize1 = cool1.info['bin-size']
    binSize2 = cool2.info['bin-size']
    chromosomes = cool1.chromnames

    if binSize1 != binSize2:
        logging.error("Bin sizes of the two cool files must be equal.")
        raise ValueError("Bin sizes of the two cool files must be equal.")

    start_time = time.time()
    scc_scores = hicrepSCC(cool1, cool2, h, dBPMax, bDownSample)
    elapsed_time = time.time() - start_time
    logging.info(f"Finished SCC calculation between {name1} and {name2} in {elapsed_time:.2f} seconds")

    results = [{'SampleA': name1, 'SampleB': name2, 'Chromosome': chrom, 'SCC': scc}
               for chrom, scc in zip(chromosomes, scc_scores)]
    return results

def compute_scc(files, sample_names, h, dBPMax, bDownSample, num_workers=4):
    pairs = list(combinations(zip(files, sample_names), 2))
    results = []
    with ProcessPoolExecutor(max_workers=num_workers) as executor:
        futures = [executor.submit(calculate_pair_scc, pair, h, dBPMax, bDownSample) for pair in pairs]
        for future in futures:
            results.extend(future.result())
    return results

def main():
    parser = argparse.ArgumentParser(description="Compute SCC for pairs of .cool files.")
    parser.add_argument('--files', nargs='+', required=True, help="List of .cool files to process.")
    parser.add_argument('--sample_names', nargs='+', required=True, help="List of sample names corresponding to the .cool files.")
    parser.add_argument('--output', type=str, required=True, help="Output file path to store the results.")
    parser.add_argument('--h', type=int, default=1, help="Smoothing window radius.")
    parser.add_argument('--dBPMax', type=int, default=500000, help="Maximum genomic distance.")
    parser.add_argument('--bDownSample', action='store_true', help="Whether to perform downsampling.")
    parser.add_argument('--workers', type=int, default=4, help="Number of workers for parallel computation.")

    args = parser.parse_args()

    if len(args.files) != len(args.sample_names):
        logging.error("The number of files must match the number of sample names provided.")
        return

    logging.info("Starting SCC computations for all combinations of provided .cool files.")
    scc_results = compute_scc(args.files, args.sample_names, args.h, args.dBPMax, args.bDownSample, args.workers)
    
    df = pd.DataFrame(scc_results)
    df.to_csv(args.output, index=False)
    logging.info(f"Results have been saved to {args.output}")

if __name__ == "__main__":
    main()