|
@@ -0,0 +1,716 @@
|
|
|
+import os
|
|
|
+import yaml
|
|
|
+
|
|
|
+configfile: "config.yaml"
|
|
|
+##########################################################
|
|
|
+# 全局变量和样本信息
|
|
|
+##########################################################
|
|
|
+## 软件主目录
|
|
|
+PEAKSNAKE_HOME = config["PEAKSNAKE_HOME"] if config["PEAKSNAKE_HOME"] else os.getcwd()
|
|
|
+PEAK_TYPE = config["peak_type"]
|
|
|
+SEQ_TYPE = config["seq_type"]
|
|
|
+PEAK_SELECTION = config["peak_selection"]
|
|
|
+GENOME = config["genome"]
|
|
|
+GTF = config["gtf"]
|
|
|
+
|
|
|
+## 样本信息变量
|
|
|
+# 字典:REPLICATE to INPUT
|
|
|
+REPLICATE_TO_INPUT = {k: config['samples'][k] for k in sorted(config['samples'])}
|
|
|
+
|
|
|
+# 列表:所有 REPLICATES
|
|
|
+REPLICATES = sorted(list(set(REPLICATE_TO_INPUT.keys())))
|
|
|
+# 列表:所有 INPUTS
|
|
|
+INPUTS = sorted(list(set(REPLICATE_TO_INPUT.values())))
|
|
|
+INPUTS = [] if INPUTS == [None] else INPUTS
|
|
|
+
|
|
|
+# 字典: SAMPLE 与 rep1 rep2 rep2 对应关系
|
|
|
+SAMPLE_TO_REPLICATE = {}
|
|
|
+for s in REPLICATES:
|
|
|
+ name, rep = '_'.join(s.split('_')[:-1]), s.split('_')[-1]
|
|
|
+ SAMPLE_TO_REPLICATE.setdefault(name, []).append(rep)
|
|
|
+
|
|
|
+SAMPLE_TO_REPLICATE = {k: sorted(v) for k, v in SAMPLE_TO_REPLICATE.items()}
|
|
|
+
|
|
|
+## 生成样本信息表
|
|
|
+with open("sample_sheet.csv", 'w') as f:
|
|
|
+ f.write("SampleID,ControlID,Tissue,Factor,Condition,Treatment,Replicate,bamReads,Peaks,bamControl,PeakCaller\n")
|
|
|
+
|
|
|
+ for sample, control in REPLICATE_TO_INPUT.items():
|
|
|
+ sample_parts = sample.split('_')
|
|
|
+ factor = sample_parts[0] # 提取样本ID中的Factor
|
|
|
+ tissue = "NA"
|
|
|
+ treatment = sample_parts[1] # 在这个例子中,tissue和condition是相同的
|
|
|
+ condition = factor + "_" + treatment
|
|
|
+ replicate = sample_parts[2].replace("rep", "") # 将"rep1"和"rep2"转换为"1"和"2"
|
|
|
+
|
|
|
+ if control:
|
|
|
+ control_parts = control.split('_')
|
|
|
+ control_id = "_".join(control_parts[:2]) # 构建 ControlID
|
|
|
+ bamControl = f"clean_bams/{control}_final.bam"
|
|
|
+ else:
|
|
|
+ control_id = "NA"
|
|
|
+ bamControl = "NA"
|
|
|
+
|
|
|
+ bamReads = f"clean_bams/{sample}_final.bam"
|
|
|
+ peaks = f"clean_peaks/cutoff/{sample}_peaks.{PEAK_TYPE}Peak"
|
|
|
+
|
|
|
+ f.write(f"{sample},{control_id},{tissue},{factor},{condition},{treatment},{replicate},{bamReads},{peaks},{bamControl},bed\n")
|
|
|
+
|
|
|
+##########################################################
|
|
|
+# rule all: 最终想要生成的文件
|
|
|
+##########################################################
|
|
|
+rule all:
|
|
|
+ input:
|
|
|
+ #####################################
|
|
|
+ # 从 fastq 到 peaks
|
|
|
+ #####################################
|
|
|
+ # 最终比对结果
|
|
|
+ expand("clean_bams/{replicate}_final.bam", replicate = REPLICATES + INPUTS),
|
|
|
+ # bw 文件, deeptools 可视化
|
|
|
+ expand("clean_bams/{replicate}.bw", replicate = REPLICATES + INPUTS),
|
|
|
+ "deeptools/sample_correlation.pdf" if len(REPLICATES) > 2 else [],
|
|
|
+ "deeptools/tss_heatmap.pdf",
|
|
|
+ "deeptools/tss_tes_heatmap.pdf",
|
|
|
+ # raw peak 结果
|
|
|
+ expand("raw_peaks/{replicate}_peaks.{PEAK_TYPE}Peak", replicate = REPLICATES, PEAK_TYPE = PEAK_TYPE),
|
|
|
+ # cutoff analysis
|
|
|
+ expand("raw_peaks/{replicate}_cutoff_analysis.pdf", replicate = REPLICATES),
|
|
|
+ # clean peak 结果
|
|
|
+ expand("clean_peaks/cutoff/{replicate}_peaks.{PEAK_TYPE}Peak", replicate = REPLICATES, PEAK_TYPE = PEAK_TYPE),
|
|
|
+ # 通过 intersect or idr 进行 peak 筛选
|
|
|
+ expand("clean_peaks/{m}/{sample}_peaks.{PEAK_TYPE}Peak", sample=SAMPLE_TO_REPLICATE.keys(), m = PEAK_SELECTION, PEAK_TYPE = PEAK_TYPE),
|
|
|
+ # 所有样本共识peaks
|
|
|
+ "clean_peaks/merge/merged_peaks.bed" if len(SAMPLE_TO_REPLICATE) > 2 else [],
|
|
|
+ #####################################
|
|
|
+ # Motif 分析
|
|
|
+ #####################################
|
|
|
+ expand("motif_analysis/{sample}_summit.fa", sample = SAMPLE_TO_REPLICATE.keys()),
|
|
|
+ expand("motif_analysis/{sample}/combined.meme", sample = SAMPLE_TO_REPLICATE.keys()) if config["motif"]["do"] else [],
|
|
|
+ #####################################
|
|
|
+ # Peak 注释
|
|
|
+ #####################################
|
|
|
+ expand("peak_annotation/{sample}_peaks_allhits.txt", sample = SAMPLE_TO_REPLICATE.keys()) if config["uropa"]["do"] else [],
|
|
|
+ expand("peak_annotation/{sample}_peaks_finalhits.txt", sample = SAMPLE_TO_REPLICATE.keys()) if config["uropa"]["do"] else [],
|
|
|
+ #####################################
|
|
|
+ # 定量分析
|
|
|
+ #####################################
|
|
|
+ "counts/merged_peaks.counts.matrix",
|
|
|
+ "counts/merged_peaks.TMM.CPM.matrix",
|
|
|
+ # 差异分析
|
|
|
+ expand("diff_peaks/{contrast}.{m}.DE_results", contrast=config["contrasts"], m=config["diff_peaks"]["method"]) if config["diff_peaks"]["do"] and config["contrasts"] else [],
|
|
|
+ expand("diff_peaks_annotation/{contrast}.bed6", contrast=config["contrasts"]) if config["diff_peaks"]["do"] and config["contrasts"] and config["uropa"]["do"] else [],
|
|
|
+ #####################################
|
|
|
+ # 质控结果
|
|
|
+ #####################################
|
|
|
+ # 测序数据质控表格
|
|
|
+ "quality_control/fastp_summary.tsv",
|
|
|
+ # 比对质控表格
|
|
|
+ "quality_control/estimate_read_filtering.tsv",
|
|
|
+ # cross correlation 质控表格
|
|
|
+ "quality_control/cross_correlation_summary.tsv" if config["cross_correlation"]["do"] else [],
|
|
|
+
|
|
|
+##########################################################
|
|
|
+# 使用fastp进行原始数据质控和过滤,自动判断 SE?PE?
|
|
|
+# raw_data => clean_data
|
|
|
+##########################################################
|
|
|
+rule fastp_quality_control:
|
|
|
+ input:
|
|
|
+ fastq=["raw_data/{replicate}_R1.fastq.gz", "raw_data/{replicate}_R2.fastq.gz"] if SEQ_TYPE == 'PE' else ["raw_data/{replicate}_R1.fastq.gz"]
|
|
|
+ output:
|
|
|
+ fastq=["clean_data/{replicate}_R1.fastq.gz", "clean_data/{replicate}_R2.fastq.gz"] if SEQ_TYPE == 'PE' else ["clean_data/{replicate}_R1.fastq.gz"],
|
|
|
+ html_report="clean_data/{replicate}_fastp.html",
|
|
|
+ json_report="clean_data/{replicate}_fastp.json"
|
|
|
+ log: "logs/{replicate}_fastp_quality_control.log"
|
|
|
+ threads: 3
|
|
|
+ singularity:
|
|
|
+ PEAKSNAKE_HOME + "/sifs/commonTools_20231218.sif"
|
|
|
+ params:
|
|
|
+ fastq="-i raw_data/{replicate}_R1.fastq.gz -o clean_data/{replicate}_R1.fastq.gz -I raw_data/{replicate}_R2.fastq.gz -O clean_data/{replicate}_R2.fastq.gz" if SEQ_TYPE == 'PE' else "-i raw_data/{replicate}_R1.fastq.gz -o clean_data/{replicate}_R1.fastq.gz",
|
|
|
+ fastp=config["fastp"]
|
|
|
+ shell:
|
|
|
+ """
|
|
|
+ fastp {params.fastq} --html {output.html_report} --json {output.json_report} --thread {threads} {params.fastp} 1>{log} 2>&1
|
|
|
+ """
|
|
|
+
|
|
|
+##########################################################
|
|
|
+# 根据基因组构建 bowtie2 index
|
|
|
+##########################################################
|
|
|
+rule bowtie2_build:
|
|
|
+ input:
|
|
|
+ genome = GENOME
|
|
|
+ output:
|
|
|
+ index_prefix = "ref/genome.1.bt2"
|
|
|
+ log: "logs/bowtie2_build.log"
|
|
|
+ singularity:
|
|
|
+ PEAKSNAKE_HOME + "/sifs/bowtie2_2.5.2.sif"
|
|
|
+ threads: 6
|
|
|
+ shell:
|
|
|
+ "bowtie2-build --threads {threads} {input.genome} ref/genome 1>{log} 2>&1"
|
|
|
+
|
|
|
+##########################################################
|
|
|
+# 使用 bowtie2 将测序数据比对到参考基因组
|
|
|
+# fastq => sam
|
|
|
+##########################################################
|
|
|
+rule bowtie2_align:
|
|
|
+ input:
|
|
|
+ fastq=["clean_data/{replicate}_R1.fastq.gz", "clean_data/{replicate}_R2.fastq.gz"] if SEQ_TYPE == 'PE' else ["clean_data/{replicate}_R1.fastq.gz"],
|
|
|
+ genome_index=config["bowtie2_index"] + ".1.bt2" if config["bowtie2_index"] else "ref/genome.1.bt2"
|
|
|
+ output:
|
|
|
+ sam="raw_bams/{replicate}.sam"
|
|
|
+ log: "logs/{replicate}_bowtie2_align.log"
|
|
|
+ singularity:
|
|
|
+ PEAKSNAKE_HOME + "/sifs/bowtie2_2.5.2.sif"
|
|
|
+ threads: 4
|
|
|
+ params:
|
|
|
+ genome_index=config["bowtie2_index"] if config["bowtie2_index"] else "ref/genome",
|
|
|
+ fastq="-1 clean_data/{replicate}_R1.fastq.gz -2 clean_data/{replicate}_R2.fastq.gz" if SEQ_TYPE == 'PE' else "-U clean_data/{replicate}_R1.fastq.gz",
|
|
|
+ bowtie2=config["bowtie2"] if config["bowtie2"] else ""
|
|
|
+ shell:
|
|
|
+ """
|
|
|
+ bowtie2 -p {threads} -x {params.genome_index} {params.fastq} -S {output.sam} {params.bowtie2} 1>{log} 2>&1
|
|
|
+ """
|
|
|
+
|
|
|
+rule sort_bam:
|
|
|
+ input:
|
|
|
+ sam="raw_bams/{replicate}.sam"
|
|
|
+ output:
|
|
|
+ sorted_bam="raw_bams/{replicate}_sorted.bam"
|
|
|
+ log: "logs/{replicate}_sort_bam.log"
|
|
|
+ singularity:
|
|
|
+ PEAKSNAKE_HOME + "/sifs/commonTools_20231218.sif"
|
|
|
+ threads: 4
|
|
|
+ shell:
|
|
|
+ """
|
|
|
+ samtools sort -@ {threads} -o {output.sorted_bam} {input.sam} 1>{log} 2>&1
|
|
|
+ samtools index {output.sorted_bam}
|
|
|
+ """
|
|
|
+
|
|
|
+rule sieve_alignment:
|
|
|
+ input:
|
|
|
+ bam="raw_bams/{replicate}_sorted.bam"
|
|
|
+ output:
|
|
|
+ bam="clean_bams/{replicate}_final.bam"
|
|
|
+ log: "logs/{replicate}_sieve_alignment.log"
|
|
|
+ threads: 4
|
|
|
+ singularity:
|
|
|
+ PEAKSNAKE_HOME + "/sifs/deeptools_20231220.sif"
|
|
|
+ params:
|
|
|
+ peaksnake_home=PEAKSNAKE_HOME,
|
|
|
+ minMappingQuality=config["alignmentSieve"]["minMappingQuality"],
|
|
|
+ blacklist="--blackListFileName " + config["alignmentSieve"]["blacklist"] if config["alignmentSieve"]["blacklist"] else [],
|
|
|
+ extra=config["alignmentSieve"]["extra"] if config["alignmentSieve"]["extra"] else ""
|
|
|
+ shell:
|
|
|
+ """
|
|
|
+ alignmentSieve --numberOfProcessors {threads} --bam {input.bam} --outFile {output.bam} --filterMetrics {log} --ignoreDuplicates --minMappingQuality {params.minMappingQuality} --samFlagExclude 260 {params.blacklist} {params.extra} 1>{log} 2>&1
|
|
|
+
|
|
|
+ samtools index {output.bam}
|
|
|
+ """
|
|
|
+
|
|
|
+rule estimate_read_filtering:
|
|
|
+ input:
|
|
|
+ bams=expand("raw_bams/{replicate}_sorted.bam", replicate=REPLICATES + INPUTS)
|
|
|
+ output:
|
|
|
+ stat="quality_control/estimate_read_filtering.tsv"
|
|
|
+ threads: 4
|
|
|
+ singularity:
|
|
|
+ PEAKSNAKE_HOME + "/sifs/deeptools_20231220.sif"
|
|
|
+ log: "logs/estimate_read_filtering.log"
|
|
|
+ params:
|
|
|
+ peaksnake_home=PEAKSNAKE_HOME,
|
|
|
+ minMappingQuality=config["alignmentSieve"]["minMappingQuality"],
|
|
|
+ blacklist="--blackListFileName " + config["alignmentSieve"]["blacklist"] if config["alignmentSieve"]["blacklist"] else "",
|
|
|
+ extra=config["alignmentSieve"]["extra"] if config["alignmentSieve"]["extra"] else "",
|
|
|
+ sampleLabels=REPLICATES + INPUTS
|
|
|
+ shell:
|
|
|
+ """
|
|
|
+ estimateReadFiltering \
|
|
|
+ --numberOfProcessors {threads} \
|
|
|
+ --bam {input.bams} \
|
|
|
+ --sampleLabels {params.sampleLabels} \
|
|
|
+ --outFile {output.stat} \
|
|
|
+ --ignoreDuplicates \
|
|
|
+ --minMappingQuality {params.minMappingQuality} \
|
|
|
+ --samFlagExclude 260 \
|
|
|
+ --blackListFileName ref/blacklist.bed \
|
|
|
+ {params.extra} 1>{log} 2>&1
|
|
|
+ """
|
|
|
+
|
|
|
+##########################################################
|
|
|
+# DeepTools 绘图
|
|
|
+##########################################################
|
|
|
+rule convert_bam_to_bigwig:
|
|
|
+ input:
|
|
|
+ bam="clean_bams/{replicate}_final.bam"
|
|
|
+ output:
|
|
|
+ bw="clean_bams/{replicate}.bw"
|
|
|
+ log: "logs/{replicate}_convert_bam_to_bigwig.log"
|
|
|
+ singularity:
|
|
|
+ PEAKSNAKE_HOME + "/sifs/deeptools_20231220.sif"
|
|
|
+ threads: 2
|
|
|
+ params:
|
|
|
+ gsize=config["gsize"],
|
|
|
+ bamCoverage = config["bamCoverage"] if config["bamCoverage"] else ""
|
|
|
+ shell:
|
|
|
+ """
|
|
|
+ bamCoverage -p {threads} --bam {input.bam} -o {output.bw} --effectiveGenomeSize {params.gsize} {params.bamCoverage} 1>{log} 2>&1
|
|
|
+ """
|
|
|
+
|
|
|
+rule summarize_multiple_bigwigs:
|
|
|
+ input:
|
|
|
+ bws=expand("clean_bams/{replicate}.bw", replicate=REPLICATES + INPUTS),
|
|
|
+ tss_tes_bed=config["tss_tes_bed"]
|
|
|
+ output:
|
|
|
+ "clean_bams/bw_summary.gz"
|
|
|
+ params:
|
|
|
+ labels=REPLICATES + INPUTS
|
|
|
+ log: "logs/summarize_multiple_bigwigs.log"
|
|
|
+ singularity:
|
|
|
+ PEAKSNAKE_HOME + "/sifs/deeptools_20231220.sif"
|
|
|
+ threads: 10
|
|
|
+ shell:
|
|
|
+ """
|
|
|
+ multiBigwigSummary BED-file \
|
|
|
+ --bwfiles {input.bws} \
|
|
|
+ --labels {params.labels} \
|
|
|
+ --BED {input.tss_tes_bed} \
|
|
|
+ -o {output} -p {threads} 1>{log} 2>&1
|
|
|
+ """
|
|
|
+
|
|
|
+rule generate_correlation_plots:
|
|
|
+ input:
|
|
|
+ "clean_bams/bw_summary.gz"
|
|
|
+ output:
|
|
|
+ pdf="deeptools/sample_correlation.pdf",
|
|
|
+ tab="deeptools/sample_correlation.tab"
|
|
|
+ log: "logs/generate_correlation_plots.log"
|
|
|
+ singularity:
|
|
|
+ PEAKSNAKE_HOME + "/sifs/deeptools_20231220.sif"
|
|
|
+ shell:
|
|
|
+ """
|
|
|
+ plotCorrelation -in {input} -o {output.pdf} \
|
|
|
+ --corMethod pearson --whatToPlot heatmap \
|
|
|
+ -min 0.5 \
|
|
|
+ --plotTitle "Pearson Correlation of Samples" \
|
|
|
+ --outFileCorMatrix {output.tab} 1>{log} 2>&1
|
|
|
+ """
|
|
|
+
|
|
|
+rule plot_heatmap_reference_point:
|
|
|
+ input:
|
|
|
+ bws=expand("clean_bams/{replicate}.bw", replicate=REPLICATES + INPUTS),
|
|
|
+ tss_tes_bed=config["tss_tes_bed"],
|
|
|
+ tss_tes_shuffle_bed=config["tss_tes_shuffle_bed"]
|
|
|
+ output:
|
|
|
+ tss_matrix="deeptools/tss_matrix.gz",
|
|
|
+ tss_heatmap="deeptools/tss_heatmap.pdf"
|
|
|
+ params:
|
|
|
+ labels=REPLICATES + INPUTS
|
|
|
+ log: "logs/plot_heatmap_reference_point.log"
|
|
|
+ singularity:
|
|
|
+ PEAKSNAKE_HOME + "/sifs/deeptools_20231220.sif"
|
|
|
+ threads: 10
|
|
|
+ shell:
|
|
|
+ """
|
|
|
+ computeMatrix reference-point \
|
|
|
+ -S {input.bws} \
|
|
|
+ --samplesLabel {params.labels} \
|
|
|
+ -R {input.tss_tes_bed} {input.tss_tes_shuffle_bed} \
|
|
|
+ --referencePoint TSS \
|
|
|
+ -b 5000 -a 5000 \
|
|
|
+ --binSize 50 \
|
|
|
+ -o {output.tss_matrix} \
|
|
|
+ -p {threads} 1>{log} 2>&1
|
|
|
+
|
|
|
+ plotHeatmap -m {output.tss_matrix} -o {output.tss_heatmap} --missingDataColor 0.5 1>>{log} 2>&1
|
|
|
+ """
|
|
|
+
|
|
|
+rule plot_heatmap_scale_regions:
|
|
|
+ input:
|
|
|
+ bws=expand("clean_bams/{replicate}.bw", replicate=REPLICATES + INPUTS),
|
|
|
+ tss_tes_bed=config["tss_tes_bed"],
|
|
|
+ tss_tes_shuffle_bed=config["tss_tes_shuffle_bed"]
|
|
|
+ output:
|
|
|
+ tss_tes_matrix="deeptools/tss_tes_matrix.gz",
|
|
|
+ tss_tes_heatmap="deeptools/tss_tes_heatmap.pdf"
|
|
|
+ params:
|
|
|
+ labels=REPLICATES + INPUTS
|
|
|
+ log: "logs/plot_heatmap_scale_regions.log"
|
|
|
+ singularity:
|
|
|
+ PEAKSNAKE_HOME + "/sifs/deeptools_20231220.sif"
|
|
|
+ threads: 10
|
|
|
+ shell:
|
|
|
+ """
|
|
|
+ computeMatrix scale-regions \
|
|
|
+ -S {input.bws} \
|
|
|
+ --samplesLabel {params.labels} \
|
|
|
+ -R {input.tss_tes_bed} {input.tss_tes_shuffle_bed} \
|
|
|
+ --regionBodyLength 4000 \
|
|
|
+ -b 2000 -a 2000 \
|
|
|
+ --binSize 50 \
|
|
|
+ -o {output.tss_tes_matrix} \
|
|
|
+ -p {threads} 1>{log} 2>&1
|
|
|
+
|
|
|
+ plotHeatmap -m {output.tss_tes_matrix} -o {output.tss_tes_heatmap} --missingDataColor 0.5 1>>{log} 2>&1
|
|
|
+ """
|
|
|
+
|
|
|
+##########################################################
|
|
|
+# 对每个 replicate:input call peak
|
|
|
+##########################################################
|
|
|
+# 规则:MACS3 call peak
|
|
|
+rule callpeak_with_macs3:
|
|
|
+ input:
|
|
|
+ sorted_ip_bam="clean_bams/{replicate}_final.bam",
|
|
|
+ sorted_input_bam=lambda wildcards: f"clean_bams/{config['samples'][wildcards.replicate]}_final.bam" if config['samples'][wildcards.replicate] else []
|
|
|
+ output:
|
|
|
+ Peak="raw_peaks/{replicate}_peaks." + PEAK_TYPE + "Peak",
|
|
|
+ cutoff_analysis_txt="raw_peaks/{replicate}_cutoff_analysis.txt"
|
|
|
+ log: "logs/{replicate}_callpeak_with_macs3.log"
|
|
|
+ singularity:
|
|
|
+ PEAKSNAKE_HOME + "/sifs/macs3_idr_20231218.sif"
|
|
|
+ threads: 1
|
|
|
+ params:
|
|
|
+ control=lambda wildcards: f"-c clean_bams/{config['samples'][wildcards.replicate]}_final.bam" if config['samples'][wildcards.replicate] else "",
|
|
|
+ format="BAMPE" if SEQ_TYPE == 'PE' else "BAM",
|
|
|
+ gsize=config["gsize"],
|
|
|
+ PEAK_TYPE="--broad" if PEAK_TYPE == "broad" else "",
|
|
|
+ extra = config["macs3"]["extra"] if config["macs3"]["extra"] else ""
|
|
|
+ shell:
|
|
|
+ """
|
|
|
+ macs3 callpeak -g {params.gsize} -t {input.sorted_ip_bam} {params.control} --name raw_peaks/{wildcards.replicate} --format {params.format} --keep-dup all --qvalue 0.075 --cutoff-analysis {params.PEAK_TYPE} {params.extra} 1>{log} 2>&1
|
|
|
+ """
|
|
|
+
|
|
|
+rule plot_macs_cutoff_analysis:
|
|
|
+ input:
|
|
|
+ cutoff_analysis_txt="raw_peaks/{replicate}_cutoff_analysis.txt"
|
|
|
+ output:
|
|
|
+ cutoff_analysis_pdf="raw_peaks/{replicate}_cutoff_analysis.pdf"
|
|
|
+ log: "logs/{replicate}_plot_macs_cutoff_analysis.log"
|
|
|
+ shell:
|
|
|
+ """
|
|
|
+ python {PEAKSNAKE_HOME}/scripts/plot_macs_cutoff_analysis.py {input.cutoff_analysis_txt} {output} 1>{log} 2>&1
|
|
|
+ """
|
|
|
+
|
|
|
+rule filter_peaks_by_qscore:
|
|
|
+ input:
|
|
|
+ Peak="raw_peaks/{replicate}_peaks." + PEAK_TYPE + "Peak"
|
|
|
+ output:
|
|
|
+ Peak="clean_peaks/cutoff/{replicate}_peaks." + PEAK_TYPE + "Peak"
|
|
|
+ params:
|
|
|
+ qscore=config["macs3"]["qscore"]
|
|
|
+ shell:
|
|
|
+ """
|
|
|
+ awk '$9>{params.qscore}' {input.Peak} > {output.Peak}
|
|
|
+ """
|
|
|
+
|
|
|
+rule select_peaks_norep:
|
|
|
+ input:
|
|
|
+ Peak="clean_peaks/cutoff/{sample}_rep1_peaks." + PEAK_TYPE + "Peak"
|
|
|
+ output:
|
|
|
+ Peak="clean_peaks/norep/{sample}_peaks." + PEAK_TYPE + "Peak"
|
|
|
+ shell:
|
|
|
+ """
|
|
|
+ cp {input.Peak} {output.Peak}
|
|
|
+ """
|
|
|
+
|
|
|
+rule select_peaks_by_intersect:
|
|
|
+ input:
|
|
|
+ Peak=lambda wildcards: expand("clean_peaks/cutoff/" + wildcards.sample + "_{rep}_peaks." + PEAK_TYPE + "Peak", rep=SAMPLE_TO_REPLICATE[wildcards.sample])
|
|
|
+ output:
|
|
|
+ Peak="clean_peaks/intersect/{sample}_peaks." + PEAK_TYPE + "Peak"
|
|
|
+ params:
|
|
|
+ min_overlap=config['intersect']['min_overlap']
|
|
|
+ shell:
|
|
|
+ """
|
|
|
+ # 检查输入文件的数量
|
|
|
+ num_peaks=$(echo {input.Peak} | wc -w)
|
|
|
+
|
|
|
+ # 如果只有一个输入文件
|
|
|
+ if [ "$num_peaks" -eq 1 ]; then
|
|
|
+ cp {input.Peak[0]} {output.Peak}
|
|
|
+ else
|
|
|
+ # 复制第一个输入文件到临时文件
|
|
|
+ cp {input.Peak[0]} {wildcards.sample}.temp.bed
|
|
|
+
|
|
|
+ # 使用除第一个之外的所有输入文件
|
|
|
+ echo {input.Peak} | tr ' ' '\\n' | awk 'NR>1' | while read bed; do
|
|
|
+ bedtools intersect -f {params.min_overlap} -r -a {wildcards.sample}.temp.bed -b $bed -wa > {wildcards.sample}.temp2.bed
|
|
|
+ mv {wildcards.sample}.temp2.bed {wildcards.sample}.temp.bed
|
|
|
+ done
|
|
|
+
|
|
|
+ # 创建一个中间的 peak list 文件
|
|
|
+ cut -f 4 {wildcards.sample}.temp.bed > {wildcards.sample}.peak_lst
|
|
|
+
|
|
|
+ # 使用中间的 peak list 文件和第一个输入文件生成最终输出
|
|
|
+ awk 'NR==FNR {{a[$1]; next}} $4 in a' {wildcards.sample}.peak_lst {input.Peak[0]} > {output.Peak}
|
|
|
+
|
|
|
+ # 清理临时文件
|
|
|
+ rm {wildcards.sample}.temp.bed {wildcards.sample}.peak_lst
|
|
|
+ fi
|
|
|
+ """
|
|
|
+
|
|
|
+rule select_peaks_by_idr:
|
|
|
+ input:
|
|
|
+ rep1_peaks="raw_peaks/{sample}_rep1_peaks." + PEAK_TYPE + "Peak",
|
|
|
+ rep2_peaks="raw_peaks/{sample}_rep2_peaks." + PEAK_TYPE + "Peak"
|
|
|
+ output:
|
|
|
+ true_rep_idr="clean_peaks/idr/{sample}_true_rep_idr.txt",
|
|
|
+ idr_peaks="clean_peaks/idr/{sample}_peaks." + PEAK_TYPE + "Peak"
|
|
|
+ log: "logs/{sample}_select_peaks_by_idr.log"
|
|
|
+ singularity:
|
|
|
+ PEAKSNAKE_HOME + "/sifs/macs3_idr_20231218.sif"
|
|
|
+ threads: 1
|
|
|
+ params:
|
|
|
+ PEAK_TYPE=PEAK_TYPE,
|
|
|
+ idr_threshold=config["idr"]["idr_threshold"]
|
|
|
+ shell:
|
|
|
+ """
|
|
|
+ idr --samples {input.rep1_peaks} {input.rep2_peaks} --idr-threshold {params.idr_threshold} --output-file {output.true_rep_idr} --plot --input-file-type {PEAK_TYPE}Peak --rank p.value 1>{log} 2>&1
|
|
|
+
|
|
|
+ awk -v OFS="\\t" 'BEGIN {{FS=OFS}} {{ $4=$1":"$2"_"$3; print $1, $2, $3, $4, $5, $6, $7, $8, $9, $10 }}' {output.true_rep_idr} | sort -k1,1 -k2,2n -k3,3n > {output.idr_peaks}
|
|
|
+ """
|
|
|
+
|
|
|
+##########################################################
|
|
|
+# Motif 分析
|
|
|
+##########################################################
|
|
|
+rule extract_peak_summit_fasta:
|
|
|
+ input:
|
|
|
+ Peak="clean_peaks/" + PEAK_SELECTION + "/{sample}_peaks." + PEAK_TYPE + "Peak"
|
|
|
+ output:
|
|
|
+ summit_fa="motif_analysis/{sample}_summit.fa"
|
|
|
+ params:
|
|
|
+ top_n=config["motif"]["top_n"],
|
|
|
+ summit_flank=config["motif"]["summit_flank"],
|
|
|
+ genome=config["genome"]
|
|
|
+ shell:
|
|
|
+ """
|
|
|
+ set +o pipefail;
|
|
|
+ sort -k8,8nr {input.Peak} | head -n {params.top_n} | awk -v OFS='\\t' '{{print $1,$2+$10-{params.summit_flank},$2+$10+{params.summit_flank} + 1}}' | awk '$2>=0' | bedtools getfasta -fi {params.genome} -bed - > {output.summit_fa}
|
|
|
+ """
|
|
|
+
|
|
|
+rule motif_discovery:
|
|
|
+ input:
|
|
|
+ summit_fa="motif_analysis/{sample}_summit.fa"
|
|
|
+ output:
|
|
|
+ "motif_analysis/{sample}/combined.meme"
|
|
|
+ log: "logs/{sample}_motif_discovery.log"
|
|
|
+ params:
|
|
|
+ motif_databases=config['motif']['motif_databases'],
|
|
|
+ outdir="motif_analysis/{sample}",
|
|
|
+ extra=config['motif']['extra'] if config['motif']['extra'] else ""
|
|
|
+ singularity:
|
|
|
+ PEAKSNAKE_HOME + "/sifs/memesuite_latest.sif"
|
|
|
+ threads: 4
|
|
|
+ shell:
|
|
|
+ """
|
|
|
+ meme-chip -meme-p {threads} -db {params.motif_databases} -oc {params.outdir} {params.extra} {input.summit_fa} 1>{log} 2>&1
|
|
|
+ """
|
|
|
+
|
|
|
+##########################################################
|
|
|
+# Peak 注释
|
|
|
+##########################################################
|
|
|
+rule peak_annotation_with_uropa:
|
|
|
+ input:
|
|
|
+ Peak="clean_peaks/" + PEAK_SELECTION + "/{sample}_peaks." + PEAK_TYPE + "Peak",
|
|
|
+ gtf=config["gtf"]
|
|
|
+ output:
|
|
|
+ allhits="peak_annotation/{sample}_peaks_allhits.txt",
|
|
|
+ finalhits="peak_annotation/{sample}_peaks_finalhits.txt"
|
|
|
+ log: "logs/{sample}_peak_annotation_with_uropa.log"
|
|
|
+ params:
|
|
|
+ feature_anchor=config["uropa"]["feature_anchor"],
|
|
|
+ distance=config["uropa"]["distance"],
|
|
|
+ relative_location=config["uropa"]["relative_location"]
|
|
|
+ singularity:
|
|
|
+ PEAKSNAKE_HOME + "/sifs/uropa_4.0.3--pyhdfd78af_0.sif"
|
|
|
+ shell:
|
|
|
+ """
|
|
|
+ uropa --bed {input.Peak} --gtf {input.gtf} --feature gene --feature-anchor {params.feature_anchor} --distance {params.distance} --relative-location {params.relative_location} --outdir peak_annotation 1>{log} 2>&1
|
|
|
+ """
|
|
|
+
|
|
|
+##########################################################
|
|
|
+# 生成所有 samples 共识 peaks
|
|
|
+##########################################################
|
|
|
+rule merge_peaks:
|
|
|
+ input:
|
|
|
+ sample_peaks=expand("clean_peaks/" + PEAK_SELECTION + "/{sample}_peaks." + PEAK_TYPE + "Peak", sample=SAMPLE_TO_REPLICATE.keys())
|
|
|
+ output:
|
|
|
+ merged_peaks_bed="clean_peaks/merge/merged_peaks.bed",
|
|
|
+ merged_peaks_saf="clean_peaks/merge/merged_peaks.saf"
|
|
|
+ singularity:
|
|
|
+ PEAKSNAKE_HOME + "/sifs/commonTools_20231218.sif"
|
|
|
+ params:
|
|
|
+ fai=config['genome'] + ".fai"
|
|
|
+ shell:
|
|
|
+ """
|
|
|
+ mkdir -p clean_peaks/merge
|
|
|
+ cat {input.sample_peaks} > clean_peaks/merge/cat_peaks.bed
|
|
|
+ bedtools sort -i clean_peaks/merge/cat_peaks.bed -g {params.fai} > clean_peaks/merge/cat_sorted_peaks.bed
|
|
|
+ bedtools merge -i clean_peaks/merge/cat_sorted_peaks.bed > {output.merged_peaks_bed}
|
|
|
+ awk 'OFS="\t" {{print $1":"$2+1"-"$3, $1, $2+1, $3, "."}}' {output.merged_peaks_bed} > {output.merged_peaks_saf}
|
|
|
+ """
|
|
|
+
|
|
|
+##########################################################
|
|
|
+# 使用 feature counts 计算 replicate peak counts
|
|
|
+##########################################################
|
|
|
+rule run_feature_counts:
|
|
|
+ input:
|
|
|
+ bam="clean_bams/{replicate}_final.bam",
|
|
|
+ merged_peaks_saf="clean_peaks/merge/merged_peaks.saf"
|
|
|
+ output:
|
|
|
+ counts="counts/{replicate}.count",
|
|
|
+ stat="counts/{replicate}.log"
|
|
|
+ log: "logs/{replicate}_run_feature_counts.log"
|
|
|
+ params:
|
|
|
+ peaksnake_home=PEAKSNAKE_HOME,
|
|
|
+ isPairedEnd="TRUE" if SEQ_TYPE == 'PE' else "FALSE"
|
|
|
+ shell:
|
|
|
+ """
|
|
|
+ Rscript {params.peaksnake_home}/software/RunFeatureCounts/run-featurecounts.R -b {input.bam} --saf {input.merged_peaks_saf} --isPairedEnd {params.isPairedEnd} -o counts/{wildcards.replicate} 1>{log} 2>&1
|
|
|
+ """
|
|
|
+
|
|
|
+##########################################################
|
|
|
+# 合并生成 replicate counts 矩阵
|
|
|
+##########################################################
|
|
|
+rule create_count_matrix:
|
|
|
+ input:
|
|
|
+ expand("counts/{replicate}.count", replicate=REPLICATES)
|
|
|
+ output:
|
|
|
+ counts_matrix="counts/merged_peaks.counts.matrix",
|
|
|
+ cpm_matrix="counts/merged_peaks.TMM.CPM.matrix"
|
|
|
+ log: "logs/create_count_matrix.log"
|
|
|
+ params:
|
|
|
+ peaksnake_home=PEAKSNAKE_HOME
|
|
|
+ shell:
|
|
|
+ """
|
|
|
+ ls {input} >counts/count.fofn
|
|
|
+ perl {params.peaksnake_home}/software/RunFeatureCounts/abundance_estimates_to_matrix.pl --est_method featureCounts --quant_files counts/count.fofn --out_prefix counts/merged_peaks 1>{log} 2>&1
|
|
|
+ """
|
|
|
+
|
|
|
+##########################################################
|
|
|
+# 使用 DESeq2/ edgeR 进行差异分析
|
|
|
+##########################################################
|
|
|
+
|
|
|
+if config["contrasts"]:
|
|
|
+ # 生成 samples.txt
|
|
|
+ with open('samples.txt', 'w') as file:
|
|
|
+ for ip in REPLICATES:
|
|
|
+ # 提取 sample name
|
|
|
+ sample_name = '_'.join(ip.split('_')[:2])
|
|
|
+ file.write(f"{sample_name}\t{ip}\n")
|
|
|
+
|
|
|
+ # 生成 contrasts.txt
|
|
|
+ with open("contrasts.txt", "w") as file:
|
|
|
+ file.write("\n".join([" ".join(item.split("_vs_")) for item in config["contrasts"]]))
|
|
|
+
|
|
|
+rule run_deseq2:
|
|
|
+ input:
|
|
|
+ "counts/merged_peaks.counts.matrix"
|
|
|
+ output:
|
|
|
+ expand("diff_peaks/{contrast}.DESeq2.DE_results", contrast=config["contrasts"])
|
|
|
+ log: "logs/run_deseq2.log"
|
|
|
+ singularity:
|
|
|
+ PEAKSNAKE_HOME + "/sifs/trinity_20231218.sif"
|
|
|
+ shell:
|
|
|
+ """
|
|
|
+ run_DE_analysis.pl -m {input} --method DESeq2 -s samples.txt --contrasts contrasts.txt -o diff_peaks 1>{log} 2>&1
|
|
|
+
|
|
|
+ cd diff_peaks/
|
|
|
+ rm *count_matrix *.MA_n_Volcano.pdf *.Rscript
|
|
|
+ for file in merged_peaks.counts.matrix.*.DE_results; do
|
|
|
+ echo -e "PeakID\tsampleA\tsampleB\tFoldChange\tPValue\tPadj" >"${{file#merged_peaks.counts.matrix.}}"
|
|
|
+ grep -v "^sampleA" $file | awk 'BEGIN {{OFS="\t"}} {{ $7 = exp(log(2) * $7); print $1,$2,$3,$7,$10,$11}}' >> "${{file#merged_peaks.counts.matrix.}}"
|
|
|
+ rm $file
|
|
|
+ done
|
|
|
+ cd ..
|
|
|
+ """
|
|
|
+
|
|
|
+rule run_edgeR:
|
|
|
+ input:
|
|
|
+ "counts/merged_peaks.counts.matrix"
|
|
|
+ output:
|
|
|
+ expand("diff_peaks/{contrast}.edgeR.DE_results", contrast=config["contrasts"])
|
|
|
+ log: "logs/run_edgeR.log"
|
|
|
+ singularity:
|
|
|
+ PEAKSNAKE_HOME + "/sifs/trinity_20231218.sif"
|
|
|
+ shell:
|
|
|
+ """
|
|
|
+ run_DE_analysis.pl -m {input} --method edgeR -s samples.txt --contrasts contrasts.txt -o diff_peaks 1>{log} 2>&1
|
|
|
+
|
|
|
+ cd diff_peaks/
|
|
|
+ rm *count_matrix *.MA_n_Volcano.pdf *.Rscript
|
|
|
+ for file in merged_peaks.counts.matrix.*.DE_results; do
|
|
|
+ echo -e "PeakID\tsampleA\tsampleB\tFoldChange\tPValue\tPadj" >"${{file#merged_peaks.counts.matrix.}}"
|
|
|
+ grep -v "^sampleA" $file | awk 'BEGIN {{OFS="\t"}} {{ $4 = exp(log(2) * $4); print $1,$2,$3,$4,$6,$7}}' >> "${{file#merged_peaks.counts.matrix.}}"
|
|
|
+ rm $file
|
|
|
+ done
|
|
|
+ cd ..
|
|
|
+ """
|
|
|
+
|
|
|
+rule diff_peaks_annotation_with_uropa:
|
|
|
+ input:
|
|
|
+ diff_peaks="diff_peaks/{contrast}.edgeR.DE_results",
|
|
|
+ gtf=config["gtf"]
|
|
|
+ output:
|
|
|
+ diff_peaks_bed="diff_peaks_annotation/{contrast}.bed6",
|
|
|
+ allhits="diff_peaks_annotation/{contrast}_allhits.txt",
|
|
|
+ finalhits="diff_peaks_annotation/{contrast}_finalhits.txt"
|
|
|
+ log: "logs/{contrast}_diff_peaks_annotation_with_uropa.log"
|
|
|
+ params:
|
|
|
+ diff_peaks_padj=config["diff_peaks"]["padj"],
|
|
|
+ feature_anchor=config["uropa"]["feature_anchor"],
|
|
|
+ distance=config["uropa"]["distance"],
|
|
|
+ relative_location=config["uropa"]["relative_location"]
|
|
|
+ singularity:
|
|
|
+ PEAKSNAKE_HOME + "/sifs/uropa_4.0.3--pyhdfd78af_0.sif"
|
|
|
+ shell:
|
|
|
+ """
|
|
|
+ awk 'BEGIN {{FS="\\t"; OFS="\t"}} NR>1 && $6 < {params.diff_peaks_padj} {{split($1, arr, "[:-]"); print arr[1], arr[2]-1, arr[3], $1, $4, "."}}' {input.diff_peaks} > {output.diff_peaks_bed}
|
|
|
+
|
|
|
+ uropa --bed {output.diff_peaks_bed} --gtf {input.gtf} --feature gene --feature-anchor {params.feature_anchor} --distance {params.distance} --relative-location {params.relative_location} --outdir diff_peaks_annotation 1>{log} 2>&1
|
|
|
+ """
|
|
|
+
|
|
|
+##########################################################
|
|
|
+# 合并fastp质控结果生成表格
|
|
|
+##########################################################
|
|
|
+rule combine_fastp_reports:
|
|
|
+ input:
|
|
|
+ expand("clean_data/{replicate}_fastp.json", replicate=REPLICATES + INPUTS)
|
|
|
+ output:
|
|
|
+ "quality_control/fastp_summary.tsv"
|
|
|
+ log: "logs/combine_fastp_reports.log"
|
|
|
+ params:
|
|
|
+ peaksnake_home=PEAKSNAKE_HOME
|
|
|
+ shell:
|
|
|
+ """
|
|
|
+ Rscript {params.peaksnake_home}/scripts/combine_fastp_report.R --input {input} --output quality_control/ 1>{log} 2>&1
|
|
|
+ """
|
|
|
+
|
|
|
+##########################################################
|
|
|
+# cross correlation
|
|
|
+##########################################################
|
|
|
+rule cross_correlation:
|
|
|
+ input:
|
|
|
+ bam="clean_bams/{replicate}_final.bam"
|
|
|
+ output:
|
|
|
+ pdf="quality_control/cross_correlation/{replicate}.pdf",
|
|
|
+ tsv="quality_control/cross_correlation/{replicate}.tsv"
|
|
|
+ log: "logs/{replicate}_cross_correlation.log"
|
|
|
+ singularity:
|
|
|
+ PEAKSNAKE_HOME + "/sifs/phantompeakqualtools_20231218.sif"
|
|
|
+ params:
|
|
|
+ spp=config['cross_correlation']['spp'] if config['cross_correlation']['spp'] else ""
|
|
|
+ threads: 4
|
|
|
+ shell:
|
|
|
+ """
|
|
|
+ run_spp.R -p={threads} -c={input.bam} -savp={output.pdf} -out={output.tsv} -rf {params.spp} 1>{log} 2>&1
|
|
|
+ """
|
|
|
+
|
|
|
+rule cross_correlation_summary:
|
|
|
+ input:
|
|
|
+ expand("quality_control/cross_correlation/{replicate}.tsv", replicate=REPLICATES + INPUTS)
|
|
|
+ output:
|
|
|
+ "quality_control/cross_correlation_summary.tsv"
|
|
|
+ shell:
|
|
|
+ """
|
|
|
+ echo -e "Sample\\tTotalReadsFiltered\\tFragmentLength\\tCrossCorrelation\\tPhantomPeakLocation\\tPhantomPeakCorrelation\\tMinimumCrossCorrelationShift\\tMinimumCrossCorrelationValue\\tNSC\\tRSC\\tPhantomPeakQualityTag" >{output}
|
|
|
+ cat {input} | sort -k 1 >>{output}
|
|
|
+ """
|