hicres.py 4.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126
  1. import cooler
  2. import pandas as pd
  3. import numpy as np
  4. import argparse
  5. from concurrent.futures import ThreadPoolExecutor, as_completed
  6. def calculate_valid_bins_percentage(file_path, threshold=1000, num_threads=1, chunk_size=100000, resolutions=None):
  7. if file_path.endswith('.mcool'):
  8. return calculate_mcool_valid_bins_percentage(file_path, threshold, num_threads, chunk_size, resolutions)
  9. else:
  10. return calculate_cool_valid_bins_percentage(file_path, threshold, chunk_size)
  11. def calculate_cool_valid_bins_percentage(file_path, threshold=1000, chunk_size=100000):
  12. clr = cooler.Cooler(file_path)
  13. bins = clr.bins()[:]
  14. total_valid_bins = 0
  15. total_bins = 0
  16. for start in range(0, len(bins), chunk_size):
  17. end = min(start + chunk_size, len(bins))
  18. matrix = clr.matrix(balance=False)[start:end, start:end]
  19. bin_sums = np.sum(matrix, axis=0) + np.sum(matrix, axis=1) - np.diag(matrix)
  20. valid_bins = np.sum(bin_sums > threshold)
  21. total_valid_bins += valid_bins
  22. total_bins += bin_sums.size
  23. valid_bin_percentage = (total_valid_bins / total_bins) * 100 if total_bins > 0 else 0
  24. return {
  25. 'File': file_path,
  26. 'Valid Bin Percentage': valid_bin_percentage,
  27. 'Total Valid Bins': total_valid_bins,
  28. 'Total Bins': total_bins
  29. }
  30. def calculate_mcool_valid_bins_percentage(file_path, threshold=1000, num_threads=1, chunk_size=100000, resolutions=None):
  31. available_resolutions = cooler.fileops.list_coolers(file_path)
  32. print(f"Available resolutions in {file_path}: {available_resolutions}")
  33. if resolutions:
  34. resolutions = [f'/resolutions/{res}' for res in resolutions.split(',') if f'/resolutions/{res}' in available_resolutions]
  35. print(resolutions)
  36. if not resolutions:
  37. raise ValueError(f"No matching resolutions found in the .mcool file for specified resolutions.")
  38. else:
  39. resolutions = available_resolutions
  40. results = []
  41. with ThreadPoolExecutor(max_workers=num_threads) as executor:
  42. future_to_resolution = {
  43. executor.submit(process_resolution, file_path, resolution_path, threshold, chunk_size): resolution_path
  44. for resolution_path in resolutions
  45. }
  46. for future in as_completed(future_to_resolution):
  47. resolution_path = future_to_resolution[future]
  48. try:
  49. result = future.result()
  50. if result is not None:
  51. results.append(result)
  52. except Exception as e:
  53. print(f"Error processing resolution {resolution_path}: {e}")
  54. return {
  55. 'File': file_path,
  56. 'Results': results
  57. }
  58. def process_resolution(file_path, resolution_path, threshold, chunk_size):
  59. clr_res = cooler.Cooler(f"{file_path}::{resolution_path}")
  60. bins = clr_res.bins()[:]
  61. print(f"Processing resolution: {resolution_path}")
  62. total_valid_bins = 0
  63. total_bins = 0
  64. for start in range(0, len(bins), chunk_size):
  65. end = min(start + chunk_size, len(bins))
  66. matrix = clr_res.matrix(balance=False)[start:end, start:end]
  67. bin_sums = np.sum(matrix, axis=0) + np.sum(matrix, axis=1) - np.diag(matrix)
  68. valid_bins = np.sum(bin_sums > threshold)
  69. total_valid_bins += valid_bins
  70. total_bins += bin_sums.size
  71. valid_bin_percentage = (total_valid_bins / total_bins) * 100 if total_bins > 0 else 0
  72. return {
  73. 'Resolution': resolution_path.split('/')[-1],
  74. 'Valid Bin Percentage': valid_bin_percentage,
  75. 'Total Valid Bins': total_valid_bins,
  76. 'Total Bins': total_bins
  77. }
  78. def save_results_to_csv(results, output_path):
  79. if 'Results' in results:
  80. df = pd.DataFrame(results['Results'])
  81. df.insert(0, 'File', results['File'])
  82. else:
  83. df = pd.DataFrame([results])
  84. df.to_csv(output_path, index=False)
  85. print(f"Results saved to {output_path}")
  86. def main():
  87. parser = argparse.ArgumentParser(description="Calculate valid bins percentage from .cool or .mcool files.")
  88. parser.add_argument("file_path", type=str, help="Path to the .cool or .mcool file")
  89. parser.add_argument("--output", type=str, required=True, help="Path to save the results as CSV file")
  90. parser.add_argument("--threads", type=int, default=1, help="Number of threads to use for processing")
  91. parser.add_argument("--chunk_size", type=int, default=100000, help="Chunk size for processing large matrices")
  92. parser.add_argument("--resolutions", type=str, help="Comma-separated list of resolutions to process in the .mcool file")
  93. args = parser.parse_args()
  94. try:
  95. results = calculate_valid_bins_percentage(args.file_path, num_threads=args.threads, chunk_size=args.chunk_size, resolutions=args.resolutions)
  96. save_results_to_csv(results, args.output)
  97. except Exception as e:
  98. print(f"Error processing file {args.file_path}: {e}")
  99. if __name__ == "__main__":
  100. main()