#!/usr/bin/env python3
"""
A simple parser for the GFF3 format.

Test with transcripts.gff3 from
http://www.broadinstitute.org/annotation/gebo/help/gff3.html.

Format specification source:
http://www.sequenceontology.org/gff3.shtml

Version 1.1: Python3 ready
"""
from collections import namedtuple
import gzip
import urllib.request
import urllib.parse
import urllib.error

# Names of the nine tab-separated columns of a GFF3 feature line, in file order.
gffInfoFields = ["seqid", "source", "type", "start", "end", "score", "strand", "phase", "attributes"]
# Record type yielded by parseGFF3. Note: namedtuple instances are immutable.
GFFRecord = namedtuple("GFFRecord", gffInfoFields)


def parseGFFAttributes(attributeString):
    """
    Parse the GFF3 attribute column and return a dict.

    The column is a ";"-separated list of "key=value" pairs. Keys and values
    are URL-unquoted; values are returned as plain strings (comma-separated
    multi-values are NOT split).

    An empty column or the "." placeholder yields an empty dict. Empty
    segments (e.g. from a trailing ";") are ignored, and surrounding
    whitespace of each segment is dropped. Only the first "=" separates key
    from value, so a value may itself contain "=".

    Raises ValueError if a non-empty segment contains no "=".
    """
    ret = {}
    # "." is the GFF3 placeholder for "no attributes"
    if attributeString.strip() in ("", "."):
        return ret
    for attribute in attributeString.split(";"):
        attribute = attribute.strip()
        if not attribute:
            # Tolerate trailing or doubled semicolons
            continue
        key, separator, value = attribute.partition("=")
        if not separator:
            raise ValueError("Malformed GFF3 attribute (missing '='): %r" % attribute)
        ret[urllib.parse.unquote(key)] = urllib.parse.unquote(value)
    return ret


def _normalizeField(value, convert=urllib.parse.unquote):
    """Return None for the "." placeholder, else convert(value)."""
    return None if value == "." else convert(value)


def parseGFF3(filename):
    """
    A minimalistic GFF3 format parser.
    Yields GFFRecord objects that contain info about a single GFF3 feature.

    Supports transparent gzip decompression (selected by a ".gz" suffix).

    Column handling: "." becomes None; seqid, source, type, strand and phase
    are URL-unquoted strings; start and end are ints; score is a float;
    attributes is the dict produced by parseGFFAttributes.

    Comment/directive lines ("#...") and blank lines are skipped. Parsing
    stops at a "##FASTA" directive or at the first line starting with ">",
    because everything after that is sequence data, not features.

    Raises ValueError (with the offending line number) if a feature line does
    not have exactly nine columns or a column cannot be converted.
    """
    # Parse with transparent decompression
    openFunc = gzip.open if str(filename).endswith(".gz") else open
    # Explicit text mode: gzip.open defaults to binary, which would yield bytes
    with openFunc(filename, "rt") as infile:
        for lineNumber, line in enumerate(infile, 1):
            line = line.strip()
            if not line:
                continue
            # The rest of the file is FASTA sequence data
            if line.startswith("##FASTA") or line.startswith(">"):
                break
            if line.startswith("#"):
                continue
            parts = line.split("\t")
            # If this fails, the file format is not standard-compatible
            if len(parts) != len(gffInfoFields):
                raise ValueError(
                    "%s, line %d: expected %d tab-separated columns, found %d"
                    % (filename, lineNumber, len(gffInfoFields), len(parts))
                )
            # Normalize data
            try:
                record = GFFRecord(
                    seqid=_normalizeField(parts[0]),
                    source=_normalizeField(parts[1]),
                    type=_normalizeField(parts[2]),
                    start=_normalizeField(parts[3], int),
                    end=_normalizeField(parts[4], int),
                    score=_normalizeField(parts[5], float),
                    strand=_normalizeField(parts[6]),
                    phase=_normalizeField(parts[7]),
                    attributes=parseGFFAttributes(parts[8]),
                )
            except ValueError as err:
                raise ValueError("%s, line %d: %s" % (filename, lineNumber, err)) from err
            # Alternatively, you can emit a dictionary here, if you need mutability:
            #    yield record._asdict()
            yield record


def main():
    """Command-line entry point: count (and optionally print) the records of a GFF3 file."""
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("file", help="The GFF3 input file (.gz allowed)")
    parser.add_argument("--print-records", action="store_true",
                        help="Print all GFFRecord objects, not only the total record count")
    parser.add_argument("--filter-type", help="Ignore records not having the given type")
    args = parser.parse_args()
    # Execute the parser
    recordCount = 0
    for record in parseGFF3(args.file):
        # Apply filter, if any
        if args.filter_type and record.type != args.filter_type:
            continue
        # Print record if specified by the user
        if args.print_records:
            print(record)
        # Access attributes like this: my_strand = record.strand
        recordCount += 1
    print("Total records: %d" % recordCount)


if __name__ == "__main__":
    main()