@@ -1,5 +1,6 @@
import os
import yaml
+import pandas as pd

configfile: "config.yaml"

##########################################################
@@ -7,6 +8,7 @@ configfile: "config.yaml"
##########################################################
## Software home directory
PEAKSNAKE_HOME = config["PEAKSNAKE_HOME"] if config["PEAKSNAKE_HOME"] else os.getcwd()
+## Key parameters
PEAK_TYPE = config["peak_type"]
SEQ_TYPE = config["seq_type"]
PEAK_SELECTION = config["peak_selection"]
@@ -19,6 +21,7 @@ REPLICATE_TO_INPUT = {k: config['samples'][k] for k in sorted(config['samples'])
# List: all REPLICATES
REPLICATES = sorted(list(set(REPLICATE_TO_INPUT.keys())))
+
# List: all INPUTS
INPUTS = sorted(list(set(REPLICATE_TO_INPUT.values())))
INPUTS = [] if INPUTS == [None] else INPUTS
@@ -65,7 +68,12 @@ def get_motifs_from_meme(meme_file):
        for line in file:
            if line.startswith("MOTIF"):
                parts = line.split()
-                motif_name = parts[2] + "_" + parts[1]  # combination of the second and third columns
+                # Check that there are three columns and the third is not an empty string
+                if len(parts) == 3 and parts[2]:
+                    motif_name = parts[2] + "_" + parts[1]  # third column joined with the second
+                # Only two columns, or the third column is empty
+                elif len(parts) >= 2:
+                    motif_name = "_" + parts[1]  # use only the second column
                motifs.append(motif_name)
    return motifs
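# A minimal sketch (not part of the patch) of how MOTIF header lines in a MEME file
# map to the wildcard-safe names returned above. The IDs and alternate names below
# are hypothetical; MEME headers follow "MOTIF <identifier> [alternate name]".
def motif_name_from_header(line):
    parts = line.split()
    if len(parts) == 3 and parts[2]:
        return parts[2] + "_" + parts[1]   # alternate name + "_" + identifier
    elif len(parts) >= 2:
        return "_" + parts[1]              # no alternate name: leading underscore only

assert motif_name_from_header("MOTIF MA0123.1 MYB46") == "MYB46_MA0123.1"
assert motif_name_from_header("MOTIF MA0456.2") == "_MA0456.2"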
@@ -74,8 +82,14 @@ rule all:
        #####################################
        # From fastq to peaks
        #####################################
+        # Sequencing QC summary table
+        "quality_control/fastp_summary.tsv",
        # Final alignments
        expand("clean_bams/{replicate}_final.bam", replicate = REPLICATES + INPUTS),
+        # Alignment QC summary table
+        "quality_control/estimate_read_filtering.tsv",
+        # Cross-correlation QC table
+        "quality_control/cross_correlation_summary.tsv" if config["cross_correlation"]["do"] else [],
        # bw files for deeptools visualization
        expand("clean_bams/{replicate}.bw", replicate = REPLICATES + INPUTS),
        "deeptools/sample_correlation.pdf" if len(REPLICATES) > 2 else [],
@@ -92,34 +106,27 @@ rule all:
        # Extract peak sequences
        expand("clean_peaks/{m}/{sample}_peaks.fa", sample=SAMPLE_TO_REPLICATE.keys(), m = PEAK_SELECTION),
        # Consensus peaks across all samples
-        "clean_peaks/merge/merged_peaks.bed" if len(SAMPLE_TO_REPLICATE) > 2 else [],
-        #####################################
-        # Peak annotation
-        #####################################
-        expand("peak_annotation/{sample}_peaks_allhits.txt", sample = SAMPLE_TO_REPLICATE.keys()) if config["uropa"]["do"] else [],
-        expand("peak_annotation/{sample}_peaks_finalhits.txt", sample = SAMPLE_TO_REPLICATE.keys()) if config["uropa"]["do"] else [],
+        "clean_peaks/merge/merged_peaks.bed",
        #####################################
        # Quantification
        #####################################
        "counts/merged_peaks.counts.matrix",
        "counts/merged_peaks.TMM.CPM.matrix",
-        # Differential analysis
+        #####################################
+        # Peak annotation
+        #####################################
+        # Annotate per-sample peaks
+        expand("peak_annotation/{sample}_peaks_{hit}.txt", sample = SAMPLE_TO_REPLICATE.keys(), hit = ["allhits", "finalhits"]) if config["uropa"]["do"] else [],
+        # Annotate the merged peaks
+        expand("peak_annotation/merged_peaks_{hit}.txt", hit = ["allhits", "finalhits"]) if config["uropa"]["do"] else [],
+        #####################################
+        # Differential analysis
+        #####################################
        expand("diff_peaks/{contrast}.{m}.DE_results", contrast=config["contrasts"], m=config["diff_peaks"]["method"]) if config["diff_peaks"]["do"] and config["contrasts"] else [],
-        expand("diff_peaks_annotation/{contrast}.bed6", contrast=config["contrasts"]) if config["diff_peaks"]["do"] and config["contrasts"] and config["uropa"]["do"] else [],
-        #####################################
-        # QC results
-        #####################################
-        # Sequencing QC summary table
-        "quality_control/fastp_summary.tsv",
-        # Alignment QC summary table
-        "quality_control/estimate_read_filtering.tsv",
-        # Cross-correlation QC table
-        "quality_control/cross_correlation_summary.tsv" if config["cross_correlation"]["do"] else [],
        #####################################
        # Footprinting analysis
        #####################################
-        "footprinting/bindetect_results.txt",
-        expand("footprinting/{motif}/{motif}_Plot{plot}.pdf", motif=get_motifs_from_meme("motif_databases/JASPAR/JASPAR2022_CORE_plants_non-redundant_v2.meme"), plot=["Aggregate", "Heatmap"])
+        "footprinting/bindetect_results.txt" if config["footprint"]["do"] else []
##########################################################
# Raw-read QC and filtering with fastp; SE vs PE is detected automatically
##########################################################
@@ -243,7 +250,7 @@ rule estimate_read_filtering:
            --ignoreDuplicates \
            --minMappingQuality {params.minMappingQuality} \
            --samFlagExclude 260 \
-            --blackListFileName ref/blacklist.bed \
+            {params.blacklist} \
            {params.extra} 1>{log} 2>&1
        """
@@ -485,14 +492,14 @@ rule select_peaks_by_idr:
##########################################################
rule extract_peak_sequence:
    input:
-        Peak="clean_peaks/" + PEAK_SELECTION + "/{sample}_peaks." + PEAK_TYPE + "Peak"
+        Peak="clean_peaks/{m}/{sample}_peaks." + PEAK_TYPE + "Peak"
    output:
-        peak_fa="clean_peaks/" + PEAK_SELECTION + "/{sample}_peaks.fa"
+        peak_fa="clean_peaks/{m}/{sample}_peaks.fa"
    params:
        top_n=config["motif"]["top_n"],
        summit_flank=config["motif"]["summit_flank"],
        genome=config["genome"],
-        summit_fa="clean_peaks/" + PEAK_SELECTION + "/{sample}_summit.fa"
+        summit_fa="clean_peaks/{m}/{sample}_summit.fa"
    shell:
        """
        set +o pipefail;
@@ -548,7 +555,27 @@ rule merge_peaks:
        cat {input.sample_peaks} > clean_peaks/merge/cat_peaks.bed
        bedtools sort -i clean_peaks/merge/cat_peaks.bed -g {params.fai} > clean_peaks/merge/cat_sorted_peaks.bed
        bedtools merge -i clean_peaks/merge/cat_sorted_peaks.bed > {output.merged_peaks_bed}
-        awk 'OFS="\t" {{print $1":"$2+1"-"$3, $1, $2+1, $3, "."}}' {output.merged_peaks_bed} > {output.merged_peaks_saf}
+        awk 'OFS="\t" {{print $1":"$2"-"$3, $1, $2+1, $3, "."}}' {output.merged_peaks_bed} > {output.merged_peaks_saf}
+        """
+
+rule merged_peak_annotation_with_uropa:
+    input:
+        Peak="clean_peaks/merge/merged_peaks.bed",
+        gtf=config["gtf"]
+    output:
+        allhits="peak_annotation/merged_peaks_allhits.txt",
+        finalhits="peak_annotation/merged_peaks_finalhits.txt"
+    log: "logs/merged_peak_annotation_with_uropa.log"
+    params:
+        feature=config["uropa"]["feature"],
+        feature_anchor=config["uropa"]["feature_anchor"],
+        distance=config["uropa"]["distance"],
+        relative_location=config["uropa"]["relative_location"]
+    singularity:
+        PEAKSNAKE_HOME + "/sifs/uropa_4.0.3--pyhdfd78af_0.sif"
+    shell:
+        """
+        uropa --bed {input.Peak} --gtf {input.gtf} --feature {params.feature} --feature-anchor {params.feature_anchor} --distance {params.distance} --relative-location {params.relative_location} --show-attributes gene_id --outdir peak_annotation 1>{log} 2>&1
        """
##########################################################
@@ -610,17 +637,19 @@ rule run_deseq2:
    output:
        expand("diff_peaks/{contrast}.DESeq2.DE_results", contrast=config["contrasts"])
    log: "logs/run_deseq2.log"
+    params:
+        extra=config["diff_peaks"]["extra"]
    singularity:
        PEAKSNAKE_HOME + "/sifs/trinity_20231218.sif"
    shell:
        """
-        run_DE_analysis.pl -m {input} --method DESeq2 -s samples.txt --contrasts contrasts.txt -o diff_peaks 1>{log} 2>&1
+        run_DE_analysis.pl -m {input} --method DESeq2 -s samples.txt --contrasts contrasts.txt -o diff_peaks {params.extra} 1>{log} 2>&1

        cd diff_peaks/
        rm *count_matrix *.MA_n_Volcano.pdf *.Rscript
        for file in merged_peaks.counts.matrix.*.DE_results; do
-            echo -e "PeakID\tsampleA\tsampleB\tFoldChange\tPValue\tPadj" >"${{file#merged_peaks.counts.matrix.}}"
-            grep -v "^sampleA" $file | awk 'BEGIN {{OFS="\t"}} {{ $7 = exp(log(2) * $7); print $1,$2,$3,$7,$10,$11}}' >> "${{file#merged_peaks.counts.matrix.}}"
+            echo -e "PeakID\tsampleA\tsampleB\tlog2FoldChange\tPValue\tPadj" >"${{file#merged_peaks.counts.matrix.}}"
+            grep -v "^sampleA" $file | awk 'BEGIN {{OFS="\t"}} {{print $1,$2,$3,$7,$10,$11}}' >> "${{file#merged_peaks.counts.matrix.}}"
            rm $file
        done
        cd ..
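# A sketch (not part of the patch) of the column selection the awk above applies to the
# DESeq2 DE_results rows: the log2 fold change is now reported as-is instead of being
# back-transformed with exp(log(2) * x). Indices mirror the awk columns ($1 -> [0], etc.);
# the example values are hypothetical.
def slim_deseq2_row(fields):
    # [0]=PeakID, [1]=sampleA, [2]=sampleB, [6]=log2FoldChange, [9]=PValue, [10]=Padj
    return [fields[0], fields[1], fields[2], fields[6], fields[9], fields[10]]

row = "peak1 condA condB 10 20 30 1.5 0.1 0.2 0.001 0.01".split()
assert slim_deseq2_row(row) == ["peak1", "condA", "condB", "1.5", "0.001", "0.01"]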
@@ -632,45 +661,24 @@ rule run_edgeR:
    output:
        expand("diff_peaks/{contrast}.edgeR.DE_results", contrast=config["contrasts"])
    log: "logs/run_edgeR.log"
+    params:
+        extra=config["diff_peaks"]["extra"]
    singularity:
        PEAKSNAKE_HOME + "/sifs/trinity_20231218.sif"
    shell:
        """
-        run_DE_analysis.pl -m {input} --method edgeR -s samples.txt --contrasts contrasts.txt -o diff_peaks 1>{log} 2>&1
+        run_DE_analysis.pl -m {input} --method edgeR -s samples.txt --contrasts contrasts.txt -o diff_peaks {params.extra} 1>{log} 2>&1

        cd diff_peaks/
        rm *count_matrix *.MA_n_Volcano.pdf *.Rscript
        for file in merged_peaks.counts.matrix.*.DE_results; do
-            echo -e "PeakID\tsampleA\tsampleB\tFoldChange\tPValue\tPadj" >"${{file#merged_peaks.counts.matrix.}}"
-            grep -v "^sampleA" $file | awk 'BEGIN {{OFS="\t"}} {{ $4 = exp(log(2) * $4); print $1,$2,$3,$4,$6,$7}}' >> "${{file#merged_peaks.counts.matrix.}}"
+            echo -e "PeakID\tsampleA\tsampleB\tlog2FoldChange\tPValue\tPadj" >"${{file#merged_peaks.counts.matrix.}}"
+            grep -v "^sampleA" $file | awk 'BEGIN {{OFS="\t"}} {{print $1,$2,$3,$4,$6,$7}}' >> "${{file#merged_peaks.counts.matrix.}}"
            rm $file
        done
        cd ..
        """
-rule diff_peaks_annotation_with_uropa:
-    input:
-        diff_peaks="diff_peaks/{contrast}.edgeR.DE_results",
-        gtf=config["gtf"]
-    output:
-        diff_peaks_bed="diff_peaks_annotation/{contrast}.bed6",
-        allhits="diff_peaks_annotation/{contrast}_allhits.txt",
-        finalhits="diff_peaks_annotation/{contrast}_finalhits.txt"
-    log: "logs/{contrast}_diff_peaks_annotation_with_uropa.log"
-    params:
-        diff_peaks_padj=config["diff_peaks"]["padj"],
-        feature_anchor=config["uropa"]["feature_anchor"],
-        distance=config["uropa"]["distance"],
-        relative_location=config["uropa"]["relative_location"]
-    singularity:
-        PEAKSNAKE_HOME + "/sifs/uropa_4.0.3--pyhdfd78af_0.sif"
-    shell:
-        """
-        awk 'BEGIN {{FS="\\t"; OFS="\t"}} NR>1 && $6 < {params.diff_peaks_padj} {{split($1, arr, "[:-]"); print arr[1], arr[2]-1, arr[3], $1, $4, "."}}' {input.diff_peaks} > {output.diff_peaks_bed}
-
-        uropa --bed {output.diff_peaks_bed} --gtf {input.gtf} --feature gene --feature-anchor {params.feature_anchor} --distance {params.distance} --relative-location {params.relative_location} --outdir diff_peaks_annotation 1>{log} 2>&1
-        """
-
##########################################################
# Merge fastp QC results into a summary table
##########################################################
@@ -716,7 +724,7 @@ rule ATACorrect:
        PEAKSNAKE_HOME + "/sifs/tobias_20231231.sif"
    shell:
        """
-        TOBIAS ATACorrect --window 10 --bam {input.bam} --genome {input.genome} --peaks {input.merged_peaks_bed} --outdir footprinting --prefix {params.prefix} --cores {threads} 1>{log} 2>&1
+        TOBIAS ATACorrect --bam {input.bam} --genome {input.genome} --peaks {input.merged_peaks_bed} --outdir footprinting --prefix {params.prefix} --cores {threads} 1>{log} 2>&1
        """

rule FootprintScores:
@@ -733,7 +741,7 @@ rule FootprintScores:
        PEAKSNAKE_HOME + "/sifs/tobias_20231231.sif"
    shell:
        """
-        TOBIAS FootprintScores --window 10 --signal {input.bw} --regions {input.merged_peaks_bed} --output {output.bw} --cores {threads} 1>{log} 2>&1
+        TOBIAS ScoreBigwig --signal {input.bw} --regions {input.merged_peaks_bed} --output {output.bw} --cores {threads} 1>{log} 2>&1
        """

rule BINDetect:
@@ -742,32 +750,19 @@ rule BINDetect:
        merged_peaks_bed="clean_peaks/merge/merged_peaks.bed",
        genome=config["genome"]
    output:
-        "footprinting/bindetect_results.txt"
+        "footprinting/bindetect_results.txt",
+        expand("footprinting/{motif}/beds/{motif}_all.bed", motif=get_motifs_from_meme(config["footprint"]["motif"])),
+        expand("footprinting/{motif}/beds/{motif}_{sample}_{b}.bed", motif=get_motifs_from_meme(config["footprint"]["motif"]), sample=SAMPLE_TO_REPLICATE.keys(), b=["bound", "unbound"])
    params:
-        samples=" ".join(SAMPLE_TO_REPLICATE.keys())
+        samples=" ".join(SAMPLE_TO_REPLICATE.keys()),
+        motif_databases=config["footprint"]["motif"]
    threads: 8
    log: "logs/BINDetect.log"
    singularity:
        PEAKSNAKE_HOME + "/sifs/tobias_20231231.sif"
    shell:
        """
-        TOBIAS BINDetect --motifs motif_databases/JASPAR/JASPAR2022_CORE_plants_non-redundant_v2.meme --signals {input.bws} --genome {input.genome} --peaks {input.merged_peaks_bed} --outdir footprinting --cond_names {params.samples} --cores {threads}
-        """
-
-rule plot_per_motif:
-    input:
-        motif_bed="footprinting/{motif}/beds/{motif}_all.bed",
-        bws=expand("footprinting/{sample}_corrected.bw", sample=SAMPLE_TO_REPLICATE.keys())
-    output:
-        aggregate_pdf="footprinting/{motif}/{motif}_PlotAggregate.pdf",
-        heatmap_pdf="footprinting/{motif}/{motif}_PlotHeatmap.pdf"
-    threads: 1
-    singularity:
-        PEAKSNAKE_HOME + "/sifs/tobias_20231231.sif"
-    shell:
-        """
-        TOBIAS PlotAggregate --TFBS {input.motif_bed} --signals {input.bws} --output {output.aggregate_pdf} --share_y both --plot_boundaries --signal-on-x
-        TOBIAS PlotHeatmap --TFBS {input.motif_bed} --signals {input.bws} --output {output.heatmap_pdf} --sort_by -2
+        TOBIAS BINDetect --motifs {params.motif_databases} --signals {input.bws} --genome {input.genome} --peaks {input.merged_peaks_bed} --outdir footprinting --cond_names {params.samples} --cores {threads} 1>{log} 2>&1
        """
##########################################################