gff3_parser.py 3.2 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879
  1. #!/usr/bin/env python3
  2. """
  3. A simple parser for the GFF3 format.
  4. Test with transcripts.gff3 from
  5. http://www.broadinstitute.org/annotation/gebo/help/gff3.html.
  6. Format specification source:
  7. http://www.sequenceontology.org/gff3.shtml
  8. Version 1.1: Python3 ready
  9. """
  10. from collections import namedtuple
  11. import gzip
  12. import urllib.request, urllib.parse, urllib.error
  13. #Initialized GeneInfo named tuple. Note: namedtuple is immutable
  14. gffInfoFields = ["seqid", "source", "type", "start", "end", "score", "strand", "phase", "attributes"]
  15. GFFRecord = namedtuple("GFFRecord", gffInfoFields)
  16. def parseGFFAttributes(attributeString):
  17. """Parse the GFF3 attribute column and return a dict"""#
  18. if attributeString == ".": return {}
  19. ret = {}
  20. for attribute in attributeString.split(";"):
  21. key, value = attribute.split("=")
  22. ret[urllib.parse.unquote(key)] = urllib.parse.unquote(value)
  23. return ret
  24. def parseGFF3(filename):
  25. """
  26. A minimalistic GFF3 format parser.
  27. Yields objects that contain info about a single GFF3 feature.
  28. Supports transparent gzip decompression.
  29. """
  30. #Parse with transparent decompression
  31. openFunc = gzip.open if filename.endswith(".gz") else open
  32. with openFunc(filename) as infile:
  33. for line in infile:
  34. if line.startswith("#"): continue
  35. parts = line.strip().split("\t")
  36. #If this fails, the file format is not standard-compatible
  37. assert len(parts) == len(gffInfoFields)
  38. #Normalize data
  39. normalizedInfo = {
  40. "seqid": None if parts[0] == "." else urllib.parse.unquote(parts[0]),
  41. "source": None if parts[1] == "." else urllib.parse.unquote(parts[1]),
  42. "type": None if parts[2] == "." else urllib.parse.unquote(parts[2]),
  43. "start": None if parts[3] == "." else int(parts[3]),
  44. "end": None if parts[4] == "." else int(parts[4]),
  45. "score": None if parts[5] == "." else float(parts[5]),
  46. "strand": None if parts[6] == "." else urllib.parse.unquote(parts[6]),
  47. "phase": None if parts[7] == "." else urllib.parse.unquote(parts[7]),
  48. "attributes": parseGFFAttributes(parts[8])
  49. }
  50. #Alternatively, you can emit the dictionary here, if you need mutability:
  51. # yield normalizedInfo
  52. yield GFFRecord(**normalizedInfo)
  53. if __name__ == "__main__":
  54. import argparse
  55. parser = argparse.ArgumentParser()
  56. parser.add_argument("file", help="The GFF3 input file (.gz allowed)")
  57. parser.add_argument("--print-records", action="store_true", help="Print all GeneInfo objects, not only")
  58. parser.add_argument("--filter-type", help="Ignore records not having the given type")
  59. args = parser.parse_args()
  60. #Execute the parser
  61. recordCount = 0
  62. for record in parseGFF3(args.file):
  63. #Apply filter, if any
  64. if args.filter_type and record.type != args.filter_type:
  65. continue
  66. #Print record if specified by the user
  67. if args.print_records: print(record)
  68. #Access attributes like this: my_strand = record.strand
  69. recordCount += 1
  70. print("Total records: %d" % recordCount)