bowtie2bamqc.py 5.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142
  1. #!/usr/bin/env python3
  2. import pysam
  3. import argparse
  4. import os
  5. import glob
  6. def read_list_from_file(filename):
  7. '''Read a list of chromosome names from a file.'''
  8. with open(filename, 'r') as file:
  9. return [line.strip() for line in file]
  10. def clean_filename(filename):
  11. '''Remove various BAM-related extensions from a filename.'''
  12. for ext in [".dup.bam", ".sort.bam", ".sorted.bam", ".bam"]:
  13. if filename.endswith(ext):
  14. return filename[:-len(ext)]
  15. return filename
  16. def bam_stats(bam_file, sample_name, quality_threshold=None, blackchr1_chromosomes_file=None, blackchr2_file=None, filter_output=None):
  17. '''Calculate statistics from a BAM file and optionally filter.'''
  18. samfile = pysam.AlignmentFile(bam_file, "rb")
  19. blackchr1_chromosomes = read_list_from_file(blackchr1_chromosomes_file) if blackchr1_chromosomes_file else []
  20. blackchr2_chromosomes = read_list_from_file(blackchr2_file) if blackchr2_file else []
  21. total_reads = 0
  22. mapped_reads = 0
  23. multi_mapped_reads = 0
  24. duplicates = 0
  25. low_quality = 0
  26. blackchr1_mapped_reads = 0
  27. blackchr2_mapped_reads = 0
  28. passed_filter_reads = 0
  29. output_file = None
  30. if filter_output:
  31. output_file = pysam.AlignmentFile(filter_output, "wb", header=samfile.header)
  32. for read in samfile:
  33. total_reads += 1
  34. is_filter_passed = True
  35. if read.is_duplicate:
  36. duplicates += 1
  37. is_filter_passed = False
  38. if quality_threshold and read.mapping_quality < quality_threshold:
  39. low_quality += 1
  40. is_filter_passed = False
  41. if not read.is_unmapped:
  42. mapped_reads += 1
  43. if read.has_tag("XS"):
  44. multi_mapped_reads += 1
  45. is_filter_passed = False
  46. if read.reference_name in blackchr1_chromosomes:
  47. blackchr1_mapped_reads += 1
  48. is_filter_passed = False
  49. if read.reference_name in blackchr2_chromosomes:
  50. blackchr2_mapped_reads += 1
  51. is_filter_passed = False
  52. if is_filter_passed:
  53. passed_filter_reads += 1
  54. if output_file:
  55. output_file.write(read)
  56. samfile.close()
  57. if output_file:
  58. output_file.close()
  59. return {
  60. "sample_name": sample_name,
  61. "total_reads": total_reads,
  62. "mapped_reads": mapped_reads,
  63. "multi_mapped_reads": multi_mapped_reads,
  64. "duplicates": duplicates,
  65. "low_quality": low_quality,
  66. "blackchr1_mapped_reads": blackchr1_mapped_reads,
  67. "blackchr2_mapped_reads": blackchr2_mapped_reads,
  68. "passed_filter_reads": passed_filter_reads
  69. }
  70. def summary_statistics(files):
  71. '''Summarize statistics from multiple BAM statistics files, including percentages.'''
  72. summary_data = {}
  73. for file in files:
  74. sample_name = os.path.basename(file).split('.')[0]
  75. with open(file, 'r') as f:
  76. for line in f:
  77. if line.startswith("Statistics"):
  78. continue
  79. parts = line.strip().split("\t")
  80. category = parts[0]
  81. value = parts[1]
  82. if category not in summary_data:
  83. summary_data[category] = []
  84. summary_data[category].append(value)
  85. headers = ["Category"] + [os.path.basename(file).split('.')[0] for file in files]
  86. print("\t".join(headers))
  87. for category, values in summary_data.items():
  88. print(f"{category}\t" + "\t".join(values))
  89. if __name__ == "__main__":
  90. parser = argparse.ArgumentParser(description="Stat BAM file.")
  91. parser.add_argument("-a", "--action", choices=['qc', 'summary'], required=True, help="Action to perform: 'qc' for quality control, 'summary' for summarizing multiple samples.")
  92. parser.add_argument("files", nargs='+', help="Path to the BAM file or statistics files.")
  93. parser.add_argument("--sample_name", help="Sample name for the output statistics. If not provided, it's inferred from the BAM filename.")
  94. parser.add_argument("-q", "--quality", type=int, help="Quality threshold for low-quality mappings.")
  95. parser.add_argument("--blackchr1", help="Path to the blackchr1 chromosomes file.")
  96. parser.add_argument("--blackchr2", help="Path to the blackchr2 chromosomes file.")
  97. parser.add_argument("-f", "--filter", help="Path to output filtered BAM file.")
  98. args = parser.parse_args()
  99. if args.action == 'qc':
  100. for bam_file in args.files:
  101. sample_name = args.sample_name if args.sample_name else clean_filename(os.path.basename(bam_file))
  102. result = bam_stats(bam_file, sample_name, args.quality, args.blackchr1, args.blackchr2, args.filter)
  103. headers = ["Statistics", result["sample_name"]]
  104. print("\t".join(headers))
  105. categories = [
  106. "total_reads",
  107. "mapped_reads",
  108. "multi_mapped_reads",
  109. "duplicates",
  110. "low_quality",
  111. "blackchr1_mapped_reads",
  112. "blackchr2_mapped_reads",
  113. "passed_filter_reads"
  114. ]
  115. for category in categories:
  116. count = result[category]
  117. percentage = (count / result["total_reads"]) * 100 if result["total_reads"] else 0
  118. print(f"{category.replace('_', ' ').capitalize()}\t{count} ({percentage:.2f}%)")
  119. elif args.action == 'summary':
  120. summary_statistics(args.files)