123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136 |
- import os
- ######################################################
- # RNA-Seq Denovo Assembly
- ######################################################
- Data_Dir = config["Data_Dir"]
- Assembly_Dir = config["Assembly_Dir"]
- TRINITY_HOME= config["TRINITY_HOME"]
- SAMPLES_FILE = config["SAMPLES_FILE"]
- trinity_max_memory = config["trinity_max_memory"]
- busco_database = config["busco_database"]
- localrules: trinity_stat, fetchClusterSeqs
- rule trinity:
- input:
- fq = expand(Data_Dir +"/{sample}_{pair}.fq.gz", sample=SAMPLES, pair=[1, 2]),
- output:
- Assembly_Dir + "/trinity_out_dir.Trinity.fasta",
- Assembly_Dir + "/trinity_out_dir.Trinity.fasta.gene_trans_map"
- threads: 22
- shell:
- "{TRINITY_HOME}/Trinity --seqType fq" +
- " --output {Assembly_Dir}/trinity_out_dir" +
- " --max_memory {trinity_max_memory}" +
- " --samples_file {SAMPLES_FILE}" +
- " --CPU {threads}" +
- " --full_cleanup"
- rule trinity_stat:
- input:
- Assembly_Dir + "/trinity_out_dir.Trinity.fasta"
- output:
- Assembly_Dir + "/trinity_out_dir.Trinity.fasta.txt"
- shell:
- "{TRINITY_HOME}/util/TrinityStats.pl "
- "{input} >{output} "
- rule salmon_index:
- input:
- Assembly_Dir + "/trinity_out_dir.Trinity.fasta"
- output:
- index_log = Assembly_Dir + "/trinity_out_dir.Trinity/indexing.log"
- run:
- salmon_index_dir = os.path.dirname(output.index_log)
- shell("salmon index --index {salmon_index_dir} --transcripts {input}")
- rule salmon_quant:
- input:
- index_log = Assembly_Dir +"/trinity_out_dir.Trinity/indexing.log",
- left = Data_Dir + "/{sample}_1.fq.gz",
- right = Data_Dir + "/{sample}_2.fq.gz"
- output:
- Assembly_Dir + "/{sample}.out/aux_info/eq_classes.txt"
- run:
- salmon_index_dir = os.path.dirname(input.index_log)
- shell("salmon quant --index {salmon_index_dir} -l A --dumpEq -1 {input.left} -2 {input.right} --output {Assembly_Dir}/{wildcards.sample}.out")
- rule corset:
- input:
- expand(Assembly_Dir + "/{sample}.out/aux_info/eq_classes.txt", sample=SAMPLES)
- output:
- Assembly_Dir + "/corset-clusters.txt",
- Assembly_Dir + "/corset-counts.txt"
- shell:
- "corset -g " + ",".join(SAMPLES.values()) +
- " -n " + ",".join(SAMPLES.keys()) +
- " -i salmon_eq_classes " +
- " -p {Assembly_Dir}/corset" +
- " {input}"
- rule corset_trans_map:
- input:
- clusters = Assembly_Dir + "/corset-clusters.txt",
- transcripts = Assembly_Dir + "/trinity_out_dir.Trinity.fasta"
- output:
- all_cluster = Assembly_Dir + "/corset-all_clusters.txt",
- clusters_trans_map = Assembly_Dir + "/corset-clusters_trans_map.txt"
- run:
- fi = open(input.clusters, 'r')
- fo1 = open(output.all_cluster, 'w+')
- fo2 = open(output.clusters_trans_map, 'w+')
- trans2clust = {}
- for line in fi:
- line = line.strip()
- (trans, clust) = line.split('\t')
- trans2clust[trans] = clust
- fo2.write(clust + "\t" + clust + "_" + trans + "\n")
- fo1.write('\n'.join(set(trans2clust.values())))
- fi.close()
- fo1.close()
- fo2.close()
- rule fetchClusterSeqs:
- input:
- transcripts = Assembly_Dir + "/trinity_out_dir.Trinity.fasta",
- all_cluster = Assembly_Dir + "/corset-all_clusters.txt",
- clusters = Assembly_Dir + "/corset-clusters.txt",
- clusters_trans_map = Assembly_Dir + "/corset-clusters_trans_map.txt"
- output:
- corset_fa = Assembly_Dir + "/corset.fasta",
- corset_longest_fa = Assembly_Dir + "/corset_longest.fasta",
- corset_fa_stat = Assembly_Dir + "/corset.fasta.txt"
- shell:
- "python tools/Corset-tools/fetchClusterSeqs.py"
- " -i {input.transcripts} "
- " -t {input.all_cluster} "
- " -o {output.corset_fa} "
- " -c {input.clusters} ;"
- "python2 tools/Corset-tools/fetchClusterSeqs.py"
- " -l"
- " -i {input.transcripts} "
- " -t {input.all_cluster} "
- " -o temp.fasta "
- " -c {input.clusters} ;"
- "perl script/FastA.rename.pl {input.clusters_trans_map} temp.fasta >{output.corset_longest_fa};"
- "perl script/CorsetStats.pl {output.corset_fa} > {output.corset_fa_stat}"
- #rule lace:
- # input:
- # transcripts = Assembly_Dir + "/trinity_out_dir.Trinity.fasta",
- # clusters = Assembly_Dir + "/corset-clusters.txt",
- # output:
- # Assembly_Dir + "/SuperDuper.fasta"
- # threads: 10
- # shell:
- # "Lace.py {input.transcripts} {input.clusters} -t --cores {threads} -o {Assembly_Dir}"
- rule busco:
- input:
- corset_longest_fa = Assembly_Dir + "/corset_longest.fasta"
- output:
- Assembly_Dir + "/run_busco/short_summary_busco.txt"
- shell:
- "run_busco -i {input} -m tran -l {busco_database} -o busco;"
- "mv run_busco {Assembly_Dir}/"
|