Assembly.snake 4.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136
  1. import os
  2. ######################################################
  3. # RNA-Seq Denovo Assembly
  4. ######################################################
  5. Data_Dir = config["Data_Dir"]
  6. Assembly_Dir = config["Assembly_Dir"]
  7. TRINITY_HOME= config["TRINITY_HOME"]
  8. SAMPLES_FILE = config["SAMPLES_FILE"]
  9. trinity_max_memory = config["trinity_max_memory"]
  10. busco_database = config["busco_database"]
  11. localrules: trinity_stat, fetchClusterSeqs
  12. rule trinity:
  13. input:
  14. fq = expand(Data_Dir +"/{sample}_{pair}.fq.gz", sample=SAMPLES, pair=[1, 2]),
  15. output:
  16. Assembly_Dir + "/trinity_out_dir.Trinity.fasta",
  17. Assembly_Dir + "/trinity_out_dir.Trinity.fasta.gene_trans_map"
  18. threads: 22
  19. shell:
  20. "{TRINITY_HOME}/Trinity --seqType fq" +
  21. " --output {Assembly_Dir}/trinity_out_dir" +
  22. " --max_memory {trinity_max_memory}" +
  23. " --samples_file {SAMPLES_FILE}" +
  24. " --CPU {threads}" +
  25. " --full_cleanup"
  26. rule trinity_stat:
  27. input:
  28. Assembly_Dir + "/trinity_out_dir.Trinity.fasta"
  29. output:
  30. Assembly_Dir + "/trinity_out_dir.Trinity.fasta.txt"
  31. shell:
  32. "{TRINITY_HOME}/util/TrinityStats.pl "
  33. "{input} >{output} "
  34. rule salmon_index:
  35. input:
  36. Assembly_Dir + "/trinity_out_dir.Trinity.fasta"
  37. output:
  38. index_log = Assembly_Dir + "/trinity_out_dir.Trinity/indexing.log"
  39. run:
  40. salmon_index_dir = os.path.dirname(output.index_log)
  41. shell("salmon index --index {salmon_index_dir} --transcripts {input}")
  42. rule salmon_quant:
  43. input:
  44. index_log = Assembly_Dir +"/trinity_out_dir.Trinity/indexing.log",
  45. left = Data_Dir + "/{sample}_1.fq.gz",
  46. right = Data_Dir + "/{sample}_2.fq.gz"
  47. output:
  48. Assembly_Dir + "/{sample}.out/aux_info/eq_classes.txt"
  49. run:
  50. salmon_index_dir = os.path.dirname(input.index_log)
  51. shell("salmon quant --index {salmon_index_dir} -l A --dumpEq -1 {input.left} -2 {input.right} --output {Assembly_Dir}/{wildcards.sample}.out")
  52. rule corset:
  53. input:
  54. expand(Assembly_Dir + "/{sample}.out/aux_info/eq_classes.txt", sample=SAMPLES)
  55. output:
  56. Assembly_Dir + "/corset-clusters.txt",
  57. Assembly_Dir + "/corset-counts.txt"
  58. shell:
  59. "corset -g " + ",".join(SAMPLES.values()) +
  60. " -n " + ",".join(SAMPLES.keys()) +
  61. " -i salmon_eq_classes " +
  62. " -p {Assembly_Dir}/corset" +
  63. " {input}"
  64. rule corset_trans_map:
  65. input:
  66. clusters = Assembly_Dir + "/corset-clusters.txt",
  67. transcripts = Assembly_Dir + "/trinity_out_dir.Trinity.fasta"
  68. output:
  69. all_cluster = Assembly_Dir + "/corset-all_clusters.txt",
  70. clusters_trans_map = Assembly_Dir + "/corset-clusters_trans_map.txt"
  71. run:
  72. fi = open(input.clusters, 'r')
  73. fo1 = open(output.all_cluster, 'w+')
  74. fo2 = open(output.clusters_trans_map, 'w+')
  75. trans2clust = {}
  76. for line in fi:
  77. line = line.strip()
  78. (trans, clust) = line.split('\t')
  79. trans2clust[trans] = clust
  80. fo2.write(clust + "\t" + clust + "_" + trans + "\n")
  81. fo1.write('\n'.join(set(trans2clust.values())))
  82. fi.close()
  83. fo1.close()
  84. fo2.close()
  85. rule fetchClusterSeqs:
  86. input:
  87. transcripts = Assembly_Dir + "/trinity_out_dir.Trinity.fasta",
  88. all_cluster = Assembly_Dir + "/corset-all_clusters.txt",
  89. clusters = Assembly_Dir + "/corset-clusters.txt",
  90. clusters_trans_map = Assembly_Dir + "/corset-clusters_trans_map.txt"
  91. output:
  92. corset_fa = Assembly_Dir + "/corset.fasta",
  93. corset_longest_fa = Assembly_Dir + "/corset_longest.fasta",
  94. corset_fa_stat = Assembly_Dir + "/corset.fasta.txt"
  95. shell:
  96. "python tools/Corset-tools/fetchClusterSeqs.py"
  97. " -i {input.transcripts} "
  98. " -t {input.all_cluster} "
  99. " -o {output.corset_fa} "
  100. " -c {input.clusters} ;"
  101. "python2 tools/Corset-tools/fetchClusterSeqs.py"
  102. " -l"
  103. " -i {input.transcripts} "
  104. " -t {input.all_cluster} "
  105. " -o temp.fasta "
  106. " -c {input.clusters} ;"
  107. "perl script/FastA.rename.pl {input.clusters_trans_map} temp.fasta >{output.corset_longest_fa};"
  108. "perl script/CorsetStats.pl {output.corset_fa} > {output.corset_fa_stat}"
  109. #rule lace:
  110. # input:
  111. # transcripts = Assembly_Dir + "/trinity_out_dir.Trinity.fasta",
  112. # clusters = Assembly_Dir + "/corset-clusters.txt",
  113. # output:
  114. # Assembly_Dir + "/SuperDuper.fasta"
  115. # threads: 10
  116. # shell:
  117. # "Lace.py {input.transcripts} {input.clusters} -t --cores {threads} -o {Assembly_Dir}"
  118. rule busco:
  119. input:
  120. corset_longest_fa = Assembly_Dir + "/corset_longest.fasta"
  121. output:
  122. Assembly_Dir + "/run_busco/short_summary_busco.txt"
  123. shell:
  124. "run_busco -i {input} -m tran -l {busco_database} -o busco;"
  125. "mv run_busco {Assembly_Dir}/"