scc.py 3.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869
  1. import argparse
  2. import cooler
  3. import logging
  4. import pandas as pd
  5. from hicrep import hicrepSCC
  6. from itertools import combinations
  7. import time
  8. from concurrent.futures import ProcessPoolExecutor
  9. # 设置日志配置
  10. logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
  11. def calculate_pair_scc(pair, h, dBPMax, bDownSample):
  12. (file1, name1), (file2, name2) = pair # 解包每个元组
  13. logging.info(f"Starting SCC calculation between {name1} and {name2}")
  14. cool1 = cooler.Cooler(file1)
  15. cool2 = cooler.Cooler(file2)
  16. binSize1 = cool1.info['bin-size']
  17. binSize2 = cool2.info['bin-size']
  18. chromosomes = cool1.chromnames
  19. if binSize1 != binSize2:
  20. logging.error("Bin sizes of the two cool files must be equal.")
  21. raise ValueError("Bin sizes of the two cool files must be equal.")
  22. start_time = time.time()
  23. scc_scores = hicrepSCC(cool1, cool2, h, dBPMax, bDownSample)
  24. elapsed_time = time.time() - start_time
  25. logging.info(f"Finished SCC calculation between {name1} and {name2} in {elapsed_time:.2f} seconds")
  26. results = [{'SampleA': name1, 'SampleB': name2, 'Chromosome': chrom, 'SCC': scc}
  27. for chrom, scc in zip(chromosomes, scc_scores)]
  28. return results
  29. def compute_scc(files, sample_names, h, dBPMax, bDownSample, num_workers=4):
  30. pairs = list(combinations(zip(files, sample_names), 2))
  31. results = []
  32. with ProcessPoolExecutor(max_workers=num_workers) as executor:
  33. futures = [executor.submit(calculate_pair_scc, pair, h, dBPMax, bDownSample) for pair in pairs]
  34. for future in futures:
  35. results.extend(future.result())
  36. return results
  37. def main():
  38. parser = argparse.ArgumentParser(description="Compute SCC for pairs of .cool files.")
  39. parser.add_argument('--files', nargs='+', required=True, help="List of .cool files to process.")
  40. parser.add_argument('--sample_names', nargs='+', required=True, help="List of sample names corresponding to the .cool files.")
  41. parser.add_argument('--output', type=str, required=True, help="Output file path to store the results.")
  42. parser.add_argument('--h', type=int, default=1, help="Smoothing window radius.")
  43. parser.add_argument('--dBPMax', type=int, default=500000, help="Maximum genomic distance.")
  44. parser.add_argument('--bDownSample', action='store_true', help="Whether to perform downsampling.")
  45. parser.add_argument('--workers', type=int, default=4, help="Number of workers for parallel computation.")
  46. args = parser.parse_args()
  47. if len(args.files) != len(args.sample_names):
  48. logging.error("The number of files must match the number of sample names provided.")
  49. return
  50. logging.info("Starting SCC computations for all combinations of provided .cool files.")
  51. scc_results = compute_scc(args.files, args.sample_names, args.h, args.dBPMax, args.bDownSample, args.workers)
  52. df = pd.DataFrame(scc_results)
  53. df.to_csv(args.output, index=False)
  54. logging.info(f"Results have been saved to {args.output}")
  55. if __name__ == "__main__":
  56. main()