fetchClusterSeqs.py 7.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184
  1. #!/usr/bin/env python
  2. #python 2.7.5 requires biopython
  3. #fetchClusterSeqs.py
  4. #Version 1. Adam Taranto, April 2015
  5. #Contact, Adam Taranto, adam.taranto@anu.edu.au
  6. '''
  7. fetchClusterSeq is a helper tool for processing output from the R Package Corset (Davidson and Oshlack, 2014)
  8. Takes as input: a list of target cluster names,
  9. transcript-to-cluster mapping file generated by corset, and a fasta file containing transcript
  10. sequences.
  11. Returns: Fasta file of transcripts belonging to query clusters, with Cluster ID
  12. appended to their original sequence name.
  13. '''
  14. import csv
  15. import sys
  16. import argparse
  17. from Bio import SeqIO
  18. from Bio.Seq import Seq
  19. from os.path import basename
  20. def main(inFasta=None, targetClust=None, outFasta='filtered_seqs.fa', clustMap=None, getLong=False):
  21. if inFasta is None:
  22. sys.exit('No input fasta provided')
  23. if targetClust is None:
  24. print('No target cluster list provided. Reporting for all clusters.')
  25. if clustMap is None:
  26. sys.exit('No cluster mapping file provided')
  27. #Read transcript-to-cluster map into dictionary object called 'clustMem'.
  28. clustMem = clustDict(clustMap)
  29. #Read fasta records into dictionary called 'SeqMaster'.
  30. SeqMaster = getFasta(inFasta)
  31. #Open list of target clusters
  32. if targetClust:
  33. with open(targetClust) as f:
  34. reader = [line.rstrip('\n').split() for line in f]
  35. getClusters = list()
  36. for row in reader:
  37. if not getClusters.count(row[0]):
  38. getClusters.append(row[0])
  39. else:
  40. getClusters = list(clustMem.keys())
  41. #Open output fasta
  42. fasta_file = open(outFasta,'w')
  43. #Open log file for names not found in master set
  44. if targetClust:
  45. error_list = open(str('NotFound_' + basename(targetClust) + '.log'),'w')
  46. else:
  47. error_list = open(str('NotFound_Clusters_Transcripts.log'),'w')
  48. if getLong:
  49. reportLongSeqs(getClusters,fasta_file,error_list,clustMem,SeqMaster)
  50. else:
  51. reportSeqs(getClusters,fasta_file,error_list,clustMem,SeqMaster)
  52. def reportSeqs(getClusters,fasta_file,error_list,clustMem,SeqMaster):
  53. #Write records for transcripts belonging to target clusters to new fasta
  54. for name in getClusters:
  55. #Check if target cluster exists in cluster-map dictionary
  56. if name in clustMem:
  57. #For each transcript belonging to target cluster
  58. for trans in clustMem[name]:
  59. #Check if target transcript exists in fasta dictionary
  60. try:
  61. SeqMaster[trans]
  62. #If target transcript not in fasta, log error.
  63. except:
  64. print(('Transcript not in ref fasta: ' + name + ': ' + trans))
  65. error_list.write(name + "\t" + trans + "\n")
  66. #If target transcript is in fasta, append cluster name and print to output fasta.
  67. else:
  68. fasta_name= ">%s" % (name + '_' + trans)
  69. fasta_seq= "%s" %(SeqMaster[trans])
  70. fasta_file.write(fasta_name+"\n")
  71. fastaLines(seq=fasta_seq, n=80, file=fasta_file)
  72. #If target cluster was not in map file, log error.
  73. else:
  74. print(('Target cluster not in Map file: ' + name))
  75. error_list.write(name + "\n")
  76. fasta_file.close()
  77. error_list.close()
  78. def reportLongSeqs(getClusters,fasta_file,error_list,clustMem,SeqMaster):
  79. #Write records for transcripts belonging to target clusters to new fasta
  80. for name in getClusters:
  81. longestSeqLen = 0
  82. #Check if target cluster exists in cluster-map dictionary
  83. if name in clustMem:
  84. #For each transcript belonging to target cluster
  85. for trans in clustMem[name]:
  86. #Check if target transcript exists in fasta dictionary
  87. try:
  88. SeqMaster[trans]
  89. #If target transcript not in fasta, log error.
  90. except:
  91. print(('Transcript not in ref fasta: ' + name + ': ' + trans))
  92. error_list.write(name + "\t" + trans + "\n")
  93. #If target transcript is in fasta, append cluster name and print to output fasta.
  94. else:
  95. if len(SeqMaster[trans]) >= longestSeqLen:
  96. longSeq = SeqMaster[trans]
  97. longestSeqLen = len(SeqMaster[trans])
  98. longSeqName = trans
  99. #print longest
  100. fasta_name= ">%s" % (name + '_' + longSeqName)
  101. fasta_seq= "%s" %(longSeq)
  102. fasta_file.write(fasta_name+"\n")
  103. fastaLines(seq=fasta_seq, n=80, file=fasta_file)
  104. #If target cluster was not in map file,log error.
  105. else:
  106. print(('Target cluster not in Map file: ' + name))
  107. error_list.write(name + "\n")
  108. fasta_file.close()
  109. error_list.close()
  110. def fastaLines(seq=None, n=80, file=None):
  111. """Yield successive n-sized chunks from seq."""
  112. for i in range(0, len(seq), n):
  113. file.write(seq[i:i+n] + "\n")
  114. def clustDict(clustMap):
  115. #Read transcript-to-cluster mapping file into dict object.
  116. #Sample data row:
  117. #TranscriptID ClusterID
  118. #nnt3Ldvymb Cluster-0.0
  119. with open(clustMap) as mapFile:
  120. readMap = [line.rstrip('\n').split() for line in mapFile]
  121. clustMem={}
  122. #Write records for seqs in name file to new fasta
  123. for row in readMap:
  124. transID=row[0]
  125. clustID=row[1]
  126. if clustID not in clustMem:
  127. clustMem[clustID] = list()
  128. clustMem[clustID].append(transID)
  129. return clustMem
  130. def getFasta(inFasta):
  131. #Create empty dictionary
  132. SeqMaster={}
  133. #Populate dictionary with master set of fasta records
  134. for seq_record in SeqIO.parse(inFasta, "fasta"):
  135. SeqMaster[seq_record.id]=str(seq_record.seq)
  136. return SeqMaster
  137. def mainArgs():
  138. """Process command-line arguments"""
  139. parser = argparse.ArgumentParser(description='Takes as input: a list of target cluster names, \
  140. transcript-to-cluster mapping file generated by corset, and a fasta file containing transcript \
  141. sequences. Returns: Fasta file of transcripts belonging to target clusters, with Cluster ID \
  142. appended to their original sequence name.', prog='fetchClusterSeqs')
  143. parser.add_argument("-i","--inFasta",
  144. required=True,
  145. help="Multi fasta to extract subset from")
  146. parser.add_argument("-t","--targetClust",
  147. default=None,
  148. help="Text file with target cluster names in column one. If not provided \
  149. will return transcripts for all clusters.")
  150. parser.add_argument("-o","--outFasta",
  151. default='filtered_seqs.fa',
  152. help="Directory for new sequence file to be written to.")
  153. parser.add_argument("-c","--clustMap",
  154. required=True,
  155. help="Corset transcript-to-cluster mapping file.")
  156. parser.add_argument("-l","--longest",
  157. action='store_true',
  158. default=False,
  159. help="If set, return only longest transcript per cluster.")
  160. args = parser.parse_args()
  161. return args
  162. if __name__== '__main__':
  163. args = mainArgs()
  164. main(inFasta=args.inFasta, targetClust=args.targetClust, outFasta=args.outFasta, clustMap=args.clustMap, getLong=args.longest)