zhxd2
/
RnaSnake


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184
							#!/usr/bin/env python
#python 2.7.5 requires biopython
#fetchClusterSeqs.py
#Version 1. Adam Taranto, April 2015
#Contact, Adam Taranto, adam.taranto@anu.edu.au

''' 
fetchClusterSeq is a helper tool for processing output from the R Package Corset (Davidson and Oshlack, 2014)

Takes as input: a list of target cluster names,
transcript-to-cluster mapping file generated by corset, and a fasta file containing transcript 
sequences. 

Returns: Fasta file of transcripts belonging to query clusters, with Cluster ID 
appended to their original sequence name. 
'''

import csv
import sys
import argparse
from Bio import SeqIO
from Bio.Seq import Seq
from os.path import basename

def main(inFasta=None, targetClust=None, outFasta='filtered_seqs.fa', clustMap=None, getLong=False):
    if inFasta is None:
        sys.exit('No input fasta provided')

    if targetClust is None:
        print('No target cluster list provided. Reporting for all clusters.')

    if clustMap is None:
        sys.exit('No cluster mapping file provided')

    #Read transcript-to-cluster map into dictionary object called 'clustMem'.
    clustMem = clustDict(clustMap)

    #Read fasta records into dictionary called 'SeqMaster'.
    SeqMaster = getFasta(inFasta)

    #Open list of target clusters
    if targetClust:
        with open(targetClust) as f:
            reader = [line.rstrip('\n').split() for line in f]
        getClusters = list()
        for row in reader:
            if not getClusters.count(row[0]):
                getClusters.append(row[0])
    else:
        getClusters = list(clustMem.keys())

    #Open output fasta
    fasta_file = open(outFasta,'w')

    #Open log file for names not found in master set
    if targetClust:
        error_list = open(str('NotFound_' + basename(targetClust) + '.log'),'w')
    else:
        error_list = open(str('NotFound_Clusters_Transcripts.log'),'w')

    if getLong:
        reportLongSeqs(getClusters,fasta_file,error_list,clustMem,SeqMaster)
    else:
        reportSeqs(getClusters,fasta_file,error_list,clustMem,SeqMaster)

def reportSeqs(getClusters,fasta_file,error_list,clustMem,SeqMaster):
    #Write records for transcripts belonging to target clusters to new fasta
    for name in getClusters:
        #Check if target cluster exists in cluster-map dictionary
        if name in clustMem:
            #For each transcript belonging to target cluster
            for trans in clustMem[name]:
                #Check if target transcript exists in fasta dictionary
                try:
                    SeqMaster[trans]
                #If target transcript not in fasta, log error.
                except:
                    print(('Transcript not in ref fasta: ' + name + ': ' + trans))
                    error_list.write(name + "\t" + trans + "\n")
                #If target transcript is in fasta, append cluster name and print to output fasta.
                else:
                    fasta_name= ">%s" % (name + '_' + trans)
                    fasta_seq= "%s" %(SeqMaster[trans])
                    fasta_file.write(fasta_name+"\n")
                    fastaLines(seq=fasta_seq, n=80, file=fasta_file)
        #If target cluster was not in map file, log error.
        else:
            print(('Target cluster not in Map file: ' + name))
            error_list.write(name + "\n")
    fasta_file.close()
    error_list.close()

def reportLongSeqs(getClusters,fasta_file,error_list,clustMem,SeqMaster):
    #Write records for transcripts belonging to target clusters to new fasta
    for name in getClusters:
        longestSeqLen = 0
        #Check if target cluster exists in cluster-map dictionary
        if name in clustMem:
            #For each transcript belonging to target cluster
            for trans in clustMem[name]:
                #Check if target transcript exists in fasta dictionary
                try:
                    SeqMaster[trans]
                #If target transcript not in fasta, log error.
                except:
                    print(('Transcript not in ref fasta: ' + name + ': ' + trans))
                    error_list.write(name + "\t" + trans + "\n")
                #If target transcript is in fasta, append cluster name and print to output fasta.
                else:
                    if len(SeqMaster[trans]) >= longestSeqLen:
                        longSeq = SeqMaster[trans]
                        longestSeqLen = len(SeqMaster[trans])
                        longSeqName = trans
            #print longest
            fasta_name= ">%s" % (name + '_' + longSeqName)
            fasta_seq= "%s" %(longSeq)
            fasta_file.write(fasta_name+"\n")
            fastaLines(seq=fasta_seq, n=80, file=fasta_file)
        #If target cluster was not in map file,log error.
        else:
            print(('Target cluster not in Map file: ' + name))
            error_list.write(name + "\n")
    fasta_file.close()
    error_list.close()

def fastaLines(seq=None, n=80, file=None):
    """Yield successive n-sized chunks from seq."""
    for i in range(0, len(seq), n):
        file.write(seq[i:i+n] + "\n")

def clustDict(clustMap):
    #Read transcript-to-cluster mapping file into dict object.
    #Sample data row:
    #TranscriptID   ClusterID
    #nnt3Ldvymb Cluster-0.0
    with open(clustMap) as mapFile:
        readMap = [line.rstrip('\n').split() for line in mapFile]
    clustMem={}
    #Write records for seqs in name file to new fasta
    for row in readMap:
        transID=row[0]
        clustID=row[1]
        if clustID not in clustMem:
            clustMem[clustID] = list()
        clustMem[clustID].append(transID)
    return clustMem
    
def getFasta(inFasta):
    #Create empty dictionary
    SeqMaster={}
    #Populate dictionary with master set of fasta records
    for seq_record in SeqIO.parse(inFasta, "fasta"):
        SeqMaster[seq_record.id]=str(seq_record.seq)
    return SeqMaster

def mainArgs():
    """Process command-line arguments"""
    parser = argparse.ArgumentParser(description='Takes as input: a list of target cluster names, \
        transcript-to-cluster mapping file generated by corset, and a fasta file containing transcript \
        sequences. Returns: Fasta file of transcripts belonging to target clusters, with Cluster ID \
        appended to their original sequence name.', prog='fetchClusterSeqs')
    parser.add_argument("-i","--inFasta", 
                        required=True, 
                        help="Multi fasta to extract subset from")
    parser.add_argument("-t","--targetClust", 
                        default=None,
                        help="Text file with target cluster names in column one. If not provided \
                        will return transcripts for all clusters.")
    parser.add_argument("-o","--outFasta", 
                        default='filtered_seqs.fa', 
                        help="Directory for new sequence file to be written to.")
    parser.add_argument("-c","--clustMap", 
                        required=True, 
                        help="Corset transcript-to-cluster mapping file.")
    parser.add_argument("-l","--longest", 
                        action='store_true', 
                        default=False, 
                        help="If set, return only longest transcript per cluster.")
    args = parser.parse_args()
    return args

if __name__== '__main__':
    args = mainArgs()
    main(inFasta=args.inFasta, targetClust=args.targetClust, outFasta=args.outFasta, clustMap=args.clustMap, getLong=args.longest)