12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879 |
- #!/usr/bin/env python3
- """
- A simple parser for the GFF3 format.
- Test with transcripts.gff3 from
- http://www.broadinstitute.org/annotation/gebo/help/gff3.html.
- Format specification source:
- http://www.sequenceontology.org/gff3.shtml
- Version 1.1: Python3 ready
- """
- from collections import namedtuple
- import gzip
- import urllib.request, urllib.parse, urllib.error
# Column names of a GFF3 feature line, in file order. They double as the
# field names of GFFRecord. Note: a namedtuple is immutable.
gffInfoFields = [
    "seqid",
    "source",
    "type",
    "start",
    "end",
    "score",
    "strand",
    "phase",
    "attributes",
]
GFFRecord = namedtuple("GFFRecord", gffInfoFields)
def parseGFFAttributes(attributeString):
    """Parse the GFF3 attribute column and return a dict.

    The column is split on ";" into entries, each entry is split on its
    first "=" into a key and a value, and both are percent-decoded.

    Args:
        attributeString: Raw text of the attribute column, or "." when the
            feature has no attributes.

    Returns:
        A dict mapping decoded keys to decoded values. It is empty for "."
        and for an empty string. When a key occurs more than once, the
        last occurrence wins.

    Raises:
        ValueError: If a non-empty entry contains no "=" separator.
    """
    if attributeString == ".":
        return {}
    ret = {}
    for attribute in attributeString.split(";"):
        # A trailing ";" (or an empty column) leaves an empty entry behind;
        # skip it instead of failing on the key/value split.
        if not attribute.strip():
            continue
        # Split on the first "=" only, so a value that itself contains an
        # unescaped "=" is kept intact.
        key, separator, value = attribute.partition("=")
        if not separator:
            raise ValueError("Malformed GFF3 attribute (missing '='): %r" % attribute)
        ret[urllib.parse.unquote(key)] = urllib.parse.unquote(value)
    return ret
def _gffValue(rawValue, convert):
    """Return None for the GFF3 placeholder ".", else convert(rawValue)."""
    return None if rawValue == "." else convert(rawValue)


def parseGFF3(filename):
    """
    A minimalistic GFF3 format parser.

    Yields one GFFRecord per feature line. Lines starting with "#" and
    blank lines are skipped, and parsing stops at a "##FASTA" directive so
    that embedded sequence data is not read as features.

    Supports transparent gzip decompression for names ending in ".gz".

    Args:
        filename: Path of the GFF3 file (a str or any object whose str()
            is the path).

    Yields:
        GFFRecord tuples. Columns containing "." become None; start and
        end are ints, score is a float, attributes is a dict, and the
        remaining columns are percent-decoded strings. Use
        record._asdict() if a mutable mapping is needed.

    Raises:
        AssertionError: If a feature line does not have exactly
            len(gffInfoFields) tab-separated columns.
    """
    unquote = urllib.parse.unquote
    # Parse with transparent decompression. gzip.open() defaults to binary
    # mode, which would yield bytes and break the str comparisons below, so
    # text mode is requested explicitly for both openers.
    openFunc = gzip.open if str(filename).endswith(".gz") else open
    with openFunc(filename, "rt") as infile:
        for lineNumber, line in enumerate(infile, 1):
            # Everything after this directive is FASTA sequence data.
            if line.startswith("##FASTA"):
                break
            if line.startswith("#"):
                continue
            stripped = line.strip()
            if not stripped:
                continue
            parts = stripped.split("\t")
            # If this fails, the file format is not standard-compatible.
            # Raised explicitly (rather than via `assert`) so the check is
            # not removed when Python runs with -O.
            if len(parts) != len(gffInfoFields):
                raise AssertionError(
                    "%s, line %d: expected %d tab-separated columns, found %d"
                    % (filename, lineNumber, len(gffInfoFields), len(parts))
                )
            # Normalize data
            yield GFFRecord(
                seqid=_gffValue(parts[0], unquote),
                source=_gffValue(parts[1], unquote),
                type=_gffValue(parts[2], unquote),
                start=_gffValue(parts[3], int),
                end=_gffValue(parts[4], int),
                score=_gffValue(parts[5], float),
                strand=_gffValue(parts[6], unquote),
                phase=_gffValue(parts[7], unquote),
                attributes=parseGFFAttributes(parts[8]),
            )
-
# Command-line entry point: parse the given file, optionally filter by
# feature type and print each record, then report how many records matched.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("file", help="The GFF3 input file (.gz allowed)")
    parser.add_argument(
        "--print-records",
        action="store_true",
        help="Print all GFFRecord objects, not only the total record count",
    )
    parser.add_argument("--filter-type", help="Ignore records not having the given type")
    args = parser.parse_args()

    # Execute the parser
    recordCount = 0
    for record in parseGFF3(args.file):
        # Apply filter, if any
        if args.filter_type and record.type != args.filter_type:
            continue
        # Print record if specified by the user
        if args.print_records:
            print(record)
        # Access attributes like this: my_strand = record.strand
        recordCount += 1
    print("Total records: %d" % recordCount)
|