import os import yaml import pandas as pd configfile: "config.yaml" ########################################################## # 全局变量和样本信息 ########################################################## ## 软件主目录 PEAKSNAKE_HOME = config["PEAKSNAKE_HOME"] if config["PEAKSNAKE_HOME"] else os.getcwd() ## 关键参数 PEAK_TYPE = config["peak_type"] SEQ_TYPE = config["seq_type"] PEAK_SELECTION = config["peak_selection"] GENOME = config["genome"] GTF = config["gtf"] ## 样本信息变量 # 字典:REPLICATE to INPUT REPLICATE_TO_INPUT = {k: config['samples'][k] for k in sorted(config['samples'])} # 列表:所有 REPLICATES REPLICATES = sorted(list(set(REPLICATE_TO_INPUT.keys()))) # 列表:所有 INPUTS INPUTS = sorted(list(set(REPLICATE_TO_INPUT.values()))) INPUTS = [] if INPUTS == [None] else INPUTS # 字典: SAMPLE 与 rep1 rep2 rep2 对应关系 SAMPLE_TO_REPLICATE = {} for s in REPLICATES: name, rep = '_'.join(s.split('_')[:-1]), s.split('_')[-1] SAMPLE_TO_REPLICATE.setdefault(name, []).append(rep) SAMPLE_TO_REPLICATE = {k: sorted(v) for k, v in SAMPLE_TO_REPLICATE.items()} ## 生成样本信息表 with open("sample_sheet.csv", 'w') as f: f.write("SampleID,ControlID,Tissue,Factor,Condition,Treatment,Replicate,bamReads,Peaks,bamControl,PeakCaller\n") for sample, control in REPLICATE_TO_INPUT.items(): sample_parts = sample.split('_') factor = sample_parts[0] # 提取样本ID中的Factor tissue = "NA" treatment = sample_parts[1] # 在这个例子中,tissue和condition是相同的 condition = factor + "_" + treatment replicate = sample_parts[2].replace("rep", "") # 将"rep1"和"rep2"转换为"1"和"2" if control: control_parts = control.split('_') control_id = "_".join(control_parts[:2]) # 构建 ControlID bamControl = f"clean_bams/{control}_final.bam" else: control_id = "NA" bamControl = "NA" bamReads = f"clean_bams/{sample}_final.bam" peaks = f"clean_peaks/cutoff/{sample}_peaks.{PEAK_TYPE}Peak" f.write(f"{sample},{control_id},{tissue},{factor},{condition},{treatment},{replicate},{bamReads},{peaks},{bamControl},bed\n") ########################################################## # rule all: 最终想要生成的文件 ########################################################## def get_motifs_from_meme(meme_file): motifs = [] with open(meme_file, 'r') as file: for line in file: if line.startswith("MOTIF"): parts = line.split() # 检查是否有三列,且第三列不是空字符串 if len(parts) == 3 and parts[2]: motif_name = parts[2] + "_" + parts[1] # 第三列和第二列的组合 # 检查是否有两列或第三列为空 elif len(parts) >= 2: motif_name = "_" + parts[1] # 只使用第二列 motifs.append(motif_name) return motifs rule all: input: ##################################### # 从 fastq 到 peaks ##################################### # 测序数据质控表格 "quality_control/fastp_summary.tsv", # 最终比对结果 expand("clean_bams/{replicate}_final.bam", replicate = REPLICATES + INPUTS), # 比对质控表格 "quality_control/estimate_read_filtering.tsv", # cross correlation 质控表格 "quality_control/cross_correlation_summary.tsv" if config["cross_correlation"]["do"] else [], # bw 文件, deeptools 可视化 expand("clean_bams/{replicate}.bw", replicate = REPLICATES + INPUTS), "deeptools/sample_correlation.pdf" if len(REPLICATES) > 2 else [], "deeptools/tss_heatmap.pdf", "deeptools/tss_tes_heatmap.pdf", # raw peak 结果 expand("raw_peaks/{replicate}_peaks.{PEAK_TYPE}Peak", replicate = REPLICATES, PEAK_TYPE = PEAK_TYPE), # cutoff analysis expand("raw_peaks/{replicate}_cutoff_analysis.pdf", replicate = REPLICATES), # clean peak 结果 expand("clean_peaks/cutoff/{replicate}_peaks.{PEAK_TYPE}Peak", replicate = REPLICATES, PEAK_TYPE = PEAK_TYPE), # 通过 intersect or idr 进行 peak 筛选 expand("clean_peaks/{m}/{sample}_peaks.{PEAK_TYPE}Peak", sample=SAMPLE_TO_REPLICATE.keys(), m = PEAK_SELECTION, PEAK_TYPE = PEAK_TYPE), # 提取序列 expand("clean_peaks/{m}/{sample}_peaks.fa", sample=SAMPLE_TO_REPLICATE.keys(), m = PEAK_SELECTION), # 所有样本共识peaks "clean_peaks/merge/merged_peaks.bed", ##################################### # 定量分析 ##################################### "counts/merged_peaks.counts.matrix", "counts/merged_peaks.TMM.CPM.matrix", ##################################### # Peak 注释 ##################################### # 对单个样本 peaks 注释 expand("peak_annotation/{sample}_peaks_{hit}.txt", sample = SAMPLE_TO_REPLICATE.keys(), hit = ["allhits", "finalhits"]) if config["uropa"]["do"] else [], # 对 merge 之后的 peaks 注释 expand("peak_annotation/merged_peaks_{hit}.txt", hit = ["allhits", "finalhits"]) if config["uropa"]["do"] else [], ##################################### # 差异分析 ##################################### expand("diff_peaks/{contrast}.{m}.DE_results", contrast=config["contrasts"], m=config["diff_peaks"]["method"]) if config["diff_peaks"]["do"] and config["contrasts"] else [], ##################################### # 足迹分析 ##################################### "footprinting/bindetect_results.txt" if config["footprint"]["do"] else [] ########################################################## # 使用fastp进行原始数据质控和过滤,自动判断 SE?PE? # raw_data => clean_data ########################################################## rule fastp_quality_control: input: fastq=["raw_data/{replicate}_R1.fastq.gz", "raw_data/{replicate}_R2.fastq.gz"] if SEQ_TYPE == 'PE' else ["raw_data/{replicate}_R1.fastq.gz"] output: fastq=["clean_data/{replicate}_R1.fastq.gz", "clean_data/{replicate}_R2.fastq.gz"] if SEQ_TYPE == 'PE' else ["clean_data/{replicate}_R1.fastq.gz"], html_report="clean_data/{replicate}_fastp.html", json_report="clean_data/{replicate}_fastp.json" log: "logs/{replicate}_fastp_quality_control.log" threads: 3 singularity: PEAKSNAKE_HOME + "/sifs/commonTools_20231218.sif" params: fastq="-i raw_data/{replicate}_R1.fastq.gz -o clean_data/{replicate}_R1.fastq.gz -I raw_data/{replicate}_R2.fastq.gz -O clean_data/{replicate}_R2.fastq.gz" if SEQ_TYPE == 'PE' else "-i raw_data/{replicate}_R1.fastq.gz -o clean_data/{replicate}_R1.fastq.gz", fastp=config["fastp"] shell: """ fastp {params.fastq} --html {output.html_report} --json {output.json_report} --thread {threads} {params.fastp} 1>{log} 2>&1 """ ########################################################## # 根据基因组构建 bowtie2 index ########################################################## rule bowtie2_build: input: genome = GENOME output: index_prefix = "ref/genome.1.bt2" log: "logs/bowtie2_build.log" singularity: PEAKSNAKE_HOME + "/sifs/bowtie2_2.5.2.sif" threads: 6 shell: "bowtie2-build --threads {threads} {input.genome} ref/genome 1>{log} 2>&1" ########################################################## # 使用 bowtie2 将测序数据比对到参考基因组 # fastq => sam ########################################################## rule bowtie2_align: input: fastq=["clean_data/{replicate}_R1.fastq.gz", "clean_data/{replicate}_R2.fastq.gz"] if SEQ_TYPE == 'PE' else ["clean_data/{replicate}_R1.fastq.gz"], genome_index=config["bowtie2_index"] + ".1.bt2" if config["bowtie2_index"] else "ref/genome.1.bt2" output: sam="raw_bams/{replicate}.sam" log: "logs/{replicate}_bowtie2_align.log" singularity: PEAKSNAKE_HOME + "/sifs/bowtie2_2.5.2.sif" threads: 4 params: genome_index=config["bowtie2_index"] if config["bowtie2_index"] else "ref/genome", fastq="-1 clean_data/{replicate}_R1.fastq.gz -2 clean_data/{replicate}_R2.fastq.gz" if SEQ_TYPE == 'PE' else "-U clean_data/{replicate}_R1.fastq.gz", bowtie2=config["bowtie2"] if config["bowtie2"] else "" shell: """ bowtie2 -p {threads} -x {params.genome_index} {params.fastq} -S {output.sam} {params.bowtie2} 1>{log} 2>&1 """ rule sort_bam: input: sam="raw_bams/{replicate}.sam" output: sorted_bam="raw_bams/{replicate}_sorted.bam" log: "logs/{replicate}_sort_bam.log" singularity: PEAKSNAKE_HOME + "/sifs/commonTools_20231218.sif" threads: 4 shell: """ samtools sort -@ {threads} -o {output.sorted_bam} {input.sam} 1>{log} 2>&1 samtools index {output.sorted_bam} """ rule sieve_alignment: input: bam="raw_bams/{replicate}_sorted.bam" output: bam="clean_bams/{replicate}_final.bam" log: "logs/{replicate}_sieve_alignment.log" threads: 4 singularity: PEAKSNAKE_HOME + "/sifs/deeptools_20231220.sif" params: peaksnake_home=PEAKSNAKE_HOME, minMappingQuality=config["alignmentSieve"]["minMappingQuality"], blacklist="--blackListFileName " + config["alignmentSieve"]["blacklist"] if config["alignmentSieve"]["blacklist"] else [], extra=config["alignmentSieve"]["extra"] if config["alignmentSieve"]["extra"] else "" shell: """ alignmentSieve --numberOfProcessors {threads} --bam {input.bam} --outFile {output.bam} --filterMetrics {log} --ignoreDuplicates --minMappingQuality {params.minMappingQuality} --samFlagExclude 260 {params.blacklist} {params.extra} 1>{log} 2>&1 samtools index {output.bam} """ rule estimate_read_filtering: input: bams=expand("raw_bams/{replicate}_sorted.bam", replicate=REPLICATES + INPUTS) output: stat="quality_control/estimate_read_filtering.tsv" threads: 4 singularity: PEAKSNAKE_HOME + "/sifs/deeptools_20231220.sif" log: "logs/estimate_read_filtering.log" params: peaksnake_home=PEAKSNAKE_HOME, minMappingQuality=config["alignmentSieve"]["minMappingQuality"], blacklist="--blackListFileName " + config["alignmentSieve"]["blacklist"] if config["alignmentSieve"]["blacklist"] else "", extra=config["alignmentSieve"]["extra"] if config["alignmentSieve"]["extra"] else "", sampleLabels=REPLICATES + INPUTS shell: """ estimateReadFiltering \ --numberOfProcessors {threads} \ --bam {input.bams} \ --sampleLabels {params.sampleLabels} \ --outFile {output.stat} \ --ignoreDuplicates \ --minMappingQuality {params.minMappingQuality} \ --samFlagExclude 260 \ {params.blacklist} \ {params.extra} 1>{log} 2>&1 """ ########################################################## # DeepTools 绘图 ########################################################## rule convert_bam_to_bigwig: input: bam="clean_bams/{replicate}_final.bam" output: bw="clean_bams/{replicate}.bw" log: "logs/{replicate}_convert_bam_to_bigwig.log" singularity: PEAKSNAKE_HOME + "/sifs/deeptools_20231220.sif" threads: 2 params: gsize=config["gsize"], bamCoverage = config["bamCoverage"] if config["bamCoverage"] else "" shell: """ bamCoverage -p {threads} --bam {input.bam} -o {output.bw} --effectiveGenomeSize {params.gsize} {params.bamCoverage} 1>{log} 2>&1 """ rule summarize_multiple_bigwigs: input: bws=expand("clean_bams/{replicate}.bw", replicate=REPLICATES + INPUTS), tss_tes_bed=config["tss_tes_bed"] output: "clean_bams/bw_summary.gz" params: labels=REPLICATES + INPUTS log: "logs/summarize_multiple_bigwigs.log" singularity: PEAKSNAKE_HOME + "/sifs/deeptools_20231220.sif" threads: 10 shell: """ multiBigwigSummary BED-file \ --bwfiles {input.bws} \ --labels {params.labels} \ --BED {input.tss_tes_bed} \ -o {output} -p {threads} 1>{log} 2>&1 """ rule generate_correlation_plots: input: "clean_bams/bw_summary.gz" output: pdf="deeptools/sample_correlation.pdf", tab="deeptools/sample_correlation.tab" log: "logs/generate_correlation_plots.log" singularity: PEAKSNAKE_HOME + "/sifs/deeptools_20231220.sif" shell: """ plotCorrelation -in {input} -o {output.pdf} \ --corMethod pearson --whatToPlot heatmap \ -min 0.5 \ --plotTitle "Pearson Correlation of Samples" \ --outFileCorMatrix {output.tab} 1>{log} 2>&1 """ rule plot_heatmap_reference_point: input: bws=expand("clean_bams/{replicate}.bw", replicate=REPLICATES + INPUTS), tss_tes_bed=config["tss_tes_bed"], tss_tes_shuffle_bed=config["tss_tes_shuffle_bed"] output: tss_matrix="deeptools/tss_matrix.gz", tss_heatmap="deeptools/tss_heatmap.pdf" params: labels=REPLICATES + INPUTS log: "logs/plot_heatmap_reference_point.log" singularity: PEAKSNAKE_HOME + "/sifs/deeptools_20231220.sif" threads: 10 shell: """ computeMatrix reference-point \ -S {input.bws} \ --samplesLabel {params.labels} \ -R {input.tss_tes_bed} {input.tss_tes_shuffle_bed} \ --referencePoint TSS \ -b 5000 -a 5000 \ --binSize 50 \ -o {output.tss_matrix} \ -p {threads} 1>{log} 2>&1 plotHeatmap -m {output.tss_matrix} -o {output.tss_heatmap} --missingDataColor 0.5 1>>{log} 2>&1 """ rule plot_heatmap_scale_regions: input: bws=expand("clean_bams/{replicate}.bw", replicate=REPLICATES + INPUTS), tss_tes_bed=config["tss_tes_bed"], tss_tes_shuffle_bed=config["tss_tes_shuffle_bed"] output: tss_tes_matrix="deeptools/tss_tes_matrix.gz", tss_tes_heatmap="deeptools/tss_tes_heatmap.pdf" params: labels=REPLICATES + INPUTS log: "logs/plot_heatmap_scale_regions.log" singularity: PEAKSNAKE_HOME + "/sifs/deeptools_20231220.sif" threads: 10 shell: """ computeMatrix scale-regions \ -S {input.bws} \ --samplesLabel {params.labels} \ -R {input.tss_tes_bed} {input.tss_tes_shuffle_bed} \ --regionBodyLength 4000 \ -b 2000 -a 2000 \ --binSize 50 \ -o {output.tss_tes_matrix} \ -p {threads} 1>{log} 2>&1 plotHeatmap -m {output.tss_tes_matrix} -o {output.tss_tes_heatmap} --missingDataColor 0.5 1>>{log} 2>&1 """ ########################################################## # 对每个 replicate:input call peak ########################################################## # 规则:MACS3 call peak rule callpeak_with_macs3: input: sorted_ip_bam="clean_bams/{replicate}_final.bam", sorted_input_bam=lambda wildcards: f"clean_bams/{config['samples'][wildcards.replicate]}_final.bam" if config['samples'][wildcards.replicate] else [] output: Peak="raw_peaks/{replicate}_peaks." + PEAK_TYPE + "Peak", cutoff_analysis_txt="raw_peaks/{replicate}_cutoff_analysis.txt" log: "logs/{replicate}_callpeak_with_macs3.log" singularity: PEAKSNAKE_HOME + "/sifs/macs3_idr_20231218.sif" threads: 1 params: control=lambda wildcards: f"-c clean_bams/{config['samples'][wildcards.replicate]}_final.bam" if config['samples'][wildcards.replicate] else "", format="BAMPE" if SEQ_TYPE == 'PE' and '--nomodel' not in config["macs3"]["extra"] else "BAM", gsize=config["gsize"], PEAK_TYPE="--broad" if PEAK_TYPE == "broad" else "", extra = config["macs3"]["extra"] if config["macs3"]["extra"] else "" shell: """ macs3 callpeak -g {params.gsize} -t {input.sorted_ip_bam} {params.control} --name raw_peaks/{wildcards.replicate} --format {params.format} --keep-dup all --qvalue 0.075 --cutoff-analysis {params.PEAK_TYPE} {params.extra} 1>{log} 2>&1 """ rule plot_macs_cutoff_analysis: input: cutoff_analysis_txt="raw_peaks/{replicate}_cutoff_analysis.txt" output: cutoff_analysis_pdf="raw_peaks/{replicate}_cutoff_analysis.pdf" log: "logs/{replicate}_plot_macs_cutoff_analysis.log" shell: """ python {PEAKSNAKE_HOME}/scripts/plot_macs_cutoff_analysis.py {input.cutoff_analysis_txt} {output} 1>{log} 2>&1 """ rule filter_peaks_by_qscore: input: Peak="raw_peaks/{replicate}_peaks." + PEAK_TYPE + "Peak" output: Peak="clean_peaks/cutoff/{replicate}_peaks." + PEAK_TYPE + "Peak" params: qscore=config["macs3"]["qscore"] shell: """ awk '$9>{params.qscore}' {input.Peak} > {output.Peak} """ rule select_peaks_norep: input: Peak="clean_peaks/cutoff/{sample}_rep1_peaks." + PEAK_TYPE + "Peak" output: Peak="clean_peaks/norep/{sample}_peaks." + PEAK_TYPE + "Peak" shell: """ cp {input.Peak} {output.Peak} """ rule select_peaks_by_intersect: input: Peak=lambda wildcards: expand("clean_peaks/cutoff/" + wildcards.sample + "_{rep}_peaks." + PEAK_TYPE + "Peak", rep=SAMPLE_TO_REPLICATE[wildcards.sample]) output: Peak="clean_peaks/intersect/{sample}_peaks." + PEAK_TYPE + "Peak" params: min_overlap=config['intersect']['min_overlap'] shell: """ # 检查输入文件的数量 num_peaks=$(echo {input.Peak} | wc -w) # 如果只有一个输入文件 if [ "$num_peaks" -eq 1 ]; then cp {input.Peak[0]} {output.Peak} else # 复制第一个输入文件到临时文件 cp {input.Peak[0]} {wildcards.sample}.temp.bed # 使用除第一个之外的所有输入文件 echo {input.Peak} | tr ' ' '\\n' | awk 'NR>1' | while read bed; do bedtools intersect -f {params.min_overlap} -r -a {wildcards.sample}.temp.bed -b $bed -wa > {wildcards.sample}.temp2.bed mv {wildcards.sample}.temp2.bed {wildcards.sample}.temp.bed done # 创建一个中间的 peak list 文件 cut -f 4 {wildcards.sample}.temp.bed > {wildcards.sample}.peak_lst # 使用中间的 peak list 文件和第一个输入文件生成最终输出 awk 'NR==FNR {{a[$1]; next}} $4 in a' {wildcards.sample}.peak_lst {input.Peak[0]} > {output.Peak} # 清理临时文件 rm {wildcards.sample}.temp.bed {wildcards.sample}.peak_lst fi """ rule select_peaks_by_idr: input: rep1_peaks="raw_peaks/{sample}_rep1_peaks." + PEAK_TYPE + "Peak", rep2_peaks="raw_peaks/{sample}_rep2_peaks." + PEAK_TYPE + "Peak" output: true_rep_idr="clean_peaks/idr/{sample}_true_rep_idr.txt", idr_peaks="clean_peaks/idr/{sample}_peaks." + PEAK_TYPE + "Peak" log: "logs/{sample}_select_peaks_by_idr.log" singularity: PEAKSNAKE_HOME + "/sifs/macs3_idr_20231218.sif" threads: 1 params: PEAK_TYPE=PEAK_TYPE, idr_threshold=config["idr"]["idr_threshold"] shell: """ idr --samples {input.rep1_peaks} {input.rep2_peaks} --idr-threshold {params.idr_threshold} --output-file {output.true_rep_idr} --plot --input-file-type {PEAK_TYPE}Peak --rank p.value 1>{log} 2>&1 awk -v OFS="\\t" 'BEGIN {{FS=OFS}} {{ $4=$1":"$2"_"$3; print $1, $2, $3, $4, $5, $6, $7, $8, $9, $10 }}' {output.true_rep_idr} | sort -k1,1 -k2,2n -k3,3n > {output.idr_peaks} """ ########################################################## # 提取序列 ########################################################## rule extract_peak_sequence: input: Peak="clean_peaks/{m}/{sample}_peaks." + PEAK_TYPE + "Peak" output: peak_fa="clean_peaks/{m}/{sample}_peaks.fa" params: top_n=config["motif"]["top_n"], summit_flank=config["motif"]["summit_flank"], genome=config["genome"], summit_fa="clean_peaks/{m}/{sample}_summit.fa" shell: """ set +o pipefail; # Sorting and extracting fasta for peak sort -k8,8nr {input.Peak} | head -n {params.top_n} | bedtools getfasta -fi {params.genome} -bed - > {output.peak_fa} # Handling narrow peaks if [[ "{PEAK_TYPE}" == "narrow" ]]; then sort -k8,8nr {input.Peak} | head -n {params.top_n} | awk -v OFS='\\t' -v flank={params.summit_flank} '{{print $1, $2+$10-flank, $2+$10+flank+1}}' | awk '$2>=0' | bedtools getfasta -fi {params.genome} -bed - > {params.summit_fa} fi """ ########################################################## # Peak 注释 ########################################################## rule peak_annotation_with_uropa: input: Peak="clean_peaks/" + PEAK_SELECTION + "/{sample}_peaks." + PEAK_TYPE + "Peak", gtf=config["gtf"] output: allhits="peak_annotation/{sample}_peaks_allhits.txt", finalhits="peak_annotation/{sample}_peaks_finalhits.txt" log: "logs/{sample}_peak_annotation_with_uropa.log" params: feature=config["uropa"]["feature"], feature_anchor=config["uropa"]["feature_anchor"], distance=config["uropa"]["distance"], relative_location=config["uropa"]["relative_location"] singularity: PEAKSNAKE_HOME + "/sifs/uropa_4.0.3--pyhdfd78af_0.sif" shell: """ uropa --bed {input.Peak} --gtf {input.gtf} --feature {params.feature} --feature-anchor {params.feature_anchor} --distance {params.distance} --relative-location {params.relative_location} --show-attributes gene_id --outdir peak_annotation 1>{log} 2>&1 """ ########################################################## # 生成所有 samples 共识 peaks ########################################################## rule merge_peaks: input: sample_peaks=expand("clean_peaks/" + PEAK_SELECTION + "/{sample}_peaks." + PEAK_TYPE + "Peak", sample=SAMPLE_TO_REPLICATE.keys()) output: merged_peaks_bed="clean_peaks/merge/merged_peaks.bed", merged_peaks_saf="clean_peaks/merge/merged_peaks.saf" singularity: PEAKSNAKE_HOME + "/sifs/commonTools_20231218.sif" params: fai=config['genome'] + ".fai" shell: """ mkdir -p clean_peaks/merge cat {input.sample_peaks} > clean_peaks/merge/cat_peaks.bed bedtools sort -i clean_peaks/merge/cat_peaks.bed -g {params.fai} > clean_peaks/merge/cat_sorted_peaks.bed bedtools merge -i clean_peaks/merge/cat_sorted_peaks.bed > {output.merged_peaks_bed} awk 'OFS="\t" {{print $1":"$2"-"$3, $1, $2+1, $3, "."}}' {output.merged_peaks_bed} > {output.merged_peaks_saf} """ rule merged_peak_annotation_with_uropa: input: Peak="clean_peaks/merge/merged_peaks.bed", gtf=config["gtf"] output: allhits="peak_annotation/merged_peaks_allhits.txt", finalhits="peak_annotation/merged_peaks_finalhits.txt" log: "logs/merged_peak_annotation_with_uropa.log" params: feature=config["uropa"]["feature"], feature_anchor=config["uropa"]["feature_anchor"], distance=config["uropa"]["distance"], relative_location=config["uropa"]["relative_location"] singularity: PEAKSNAKE_HOME + "/sifs/uropa_4.0.3--pyhdfd78af_0.sif" shell: """ uropa --bed {input.Peak} --gtf {input.gtf} --feature {params.feature} --feature-anchor {params.feature_anchor} --distance {params.distance} --relative-location {params.relative_location} --show-attributes gene_id --outdir peak_annotation 1>{log} 2>&1 """ ########################################################## # 使用 feature counts 计算 replicate peak counts ########################################################## rule run_feature_counts: input: bam="clean_bams/{replicate}_final.bam", merged_peaks_saf="clean_peaks/merge/merged_peaks.saf" output: counts="counts/{replicate}.count", stat="counts/{replicate}.log" log: "logs/{replicate}_run_feature_counts.log" params: peaksnake_home=PEAKSNAKE_HOME, isPairedEnd="TRUE" if SEQ_TYPE == 'PE' else "FALSE" shell: """ Rscript {params.peaksnake_home}/software/RunFeatureCounts/run-featurecounts.R -b {input.bam} --saf {input.merged_peaks_saf} --isPairedEnd {params.isPairedEnd} -o counts/{wildcards.replicate} 1>{log} 2>&1 """ ########################################################## # 合并生成 replicate counts 矩阵 ########################################################## rule create_count_matrix: input: expand("counts/{replicate}.count", replicate=REPLICATES) output: counts_matrix="counts/merged_peaks.counts.matrix", cpm_matrix="counts/merged_peaks.TMM.CPM.matrix" log: "logs/create_count_matrix.log" params: peaksnake_home=PEAKSNAKE_HOME shell: """ ls {input} >counts/count.fofn perl {params.peaksnake_home}/software/RunFeatureCounts/abundance_estimates_to_matrix.pl --est_method featureCounts --quant_files counts/count.fofn --out_prefix counts/merged_peaks 1>{log} 2>&1 """ ########################################################## # 使用 DESeq2/ edgeR 进行差异分析 ########################################################## if config["contrasts"]: # 生成 samples.txt with open('samples.txt', 'w') as file: for ip in REPLICATES: # 提取 sample name sample_name = '_'.join(ip.split('_')[:2]) file.write(f"{sample_name}\t{ip}\n") # 生成 contrasts.txt with open("contrasts.txt", "w") as file: file.write("\n".join([" ".join(item.split("_vs_")) for item in config["contrasts"]])) rule run_deseq2: input: "counts/merged_peaks.counts.matrix" output: expand("diff_peaks/{contrast}.DESeq2.DE_results", contrast=config["contrasts"]) log: "logs/run_deseq2.log" params: extra=config["diff_peaks"]["extra"] singularity: PEAKSNAKE_HOME + "/sifs/trinity_20231218.sif" shell: """ run_DE_analysis.pl -m {input} --method DESeq2 -s samples.txt --contrasts contrasts.txt -o diff_peaks {params.extra} 1>{log} 2>&1 cd diff_peaks/ rm *count_matrix *.MA_n_Volcano.pdf *.Rscript for file in merged_peaks.counts.matrix.*.DE_results; do echo -e "PeakID\tsampleA\tsampleB\tlog2FoldChange\tPValue\tPadj" >"${{file#merged_peaks.counts.matrix.}}" grep -v "^sampleA" $file | awk 'BEGIN {{OFS="\t"}} {{print $1,$2,$3,$7,$10,$11}}' >> "${{file#merged_peaks.counts.matrix.}}" rm $file done cd .. """ rule run_edgeR: input: "counts/merged_peaks.counts.matrix" output: expand("diff_peaks/{contrast}.edgeR.DE_results", contrast=config["contrasts"]) log: "logs/run_edgeR.log" params: extra=config["diff_peaks"]["extra"] singularity: PEAKSNAKE_HOME + "/sifs/trinity_20231218.sif" shell: """ run_DE_analysis.pl -m {input} --method edgeR -s samples.txt --contrasts contrasts.txt -o diff_peaks {params.extra} 1>{log} 2>&1 cd diff_peaks/ rm *count_matrix *.MA_n_Volcano.pdf *.Rscript for file in merged_peaks.counts.matrix.*.DE_results; do echo -e "PeakID\tsampleA\tsampleB\tlog2FoldChange\tPValue\tPadj" >"${{file#merged_peaks.counts.matrix.}}" grep -v "^sampleA" $file | awk 'BEGIN {{OFS="\t"}} {{print $1,$2,$3,$4,$6,$7}}' >> "${{file#merged_peaks.counts.matrix.}}" rm $file done cd .. """ ########################################################## # 合并fastp质控结果生成表格 ########################################################## rule combine_fastp_reports: input: expand("clean_data/{replicate}_fastp.json", replicate=REPLICATES + INPUTS) output: "quality_control/fastp_summary.tsv" log: "logs/combine_fastp_reports.log" params: peaksnake_home=PEAKSNAKE_HOME shell: """ Rscript {params.peaksnake_home}/scripts/combine_fastp_report.R --input {input} --output quality_control/ 1>{log} 2>&1 """ ########################################################## # 使用 TOBIAS 进行足迹分析 ########################################################## rule merge_replicate_bams: input: bams=lambda wildcards: expand("clean_bams/{sample}_{rep}_final.bam", sample = wildcards.sample, rep=SAMPLE_TO_REPLICATE[wildcards.sample]) output: bam="clean_bams/{sample}_merged.bam" shell: """ samtools merge -o {output.bam} {input.bams} samtools index {output.bam} """ rule ATACorrect: input: bam="clean_bams/{sample}_merged.bam", genome=config["genome"], merged_peaks_bed="clean_peaks/merge/merged_peaks.bed" output: bw="footprinting/{sample}_corrected.bw" params: prefix="{sample}" threads: 8 log: "logs/{sample}_ATACorrect.log" singularity: PEAKSNAKE_HOME + "/sifs/tobias_20231231.sif" shell: """ TOBIAS ATACorrect --bam {input.bam} --genome {input.genome} --peaks {input.merged_peaks_bed} --outdir footprinting --prefix {params.prefix} --cores {threads} 1>{log} 2>&1 """ rule FootprintScores: input: bw="footprinting/{sample}_corrected.bw", merged_peaks_bed="clean_peaks/merge/merged_peaks.bed" output: bw="footprinting/{sample}_footprints.bw" params: prefix={sample} threads: 8 log: "logs/{sample}_FootprintScores.log" singularity: PEAKSNAKE_HOME + "/sifs/tobias_20231231.sif" shell: """ TOBIAS ScoreBigwig --signal {input.bw} --regions {input.merged_peaks_bed} --output {output.bw} --cores {threads} 1>{log} 2>&1 """ rule BINDetect: input: bws=expand("footprinting/{sample}_footprints.bw", sample=SAMPLE_TO_REPLICATE.keys()), merged_peaks_bed="clean_peaks/merge/merged_peaks.bed", genome=config["genome"] output: "footprinting/bindetect_results.txt", expand("footprinting/{motif}/beds/{motif}_all.bed", motif=get_motifs_from_meme(config["footprint"]["motif"])), expand("footprinting/{motif}/beds/{motif}_{sample}_{b}.bed", motif=get_motifs_from_meme(config["footprint"]["motif"]), sample=SAMPLE_TO_REPLICATE.keys(), b=["bound", "unbound"]) params: samples=" ".join(SAMPLE_TO_REPLICATE.keys()), motif_databases=config["footprint"]["motif"] threads: 8 log: "logs/BINDetect.log" singularity: PEAKSNAKE_HOME + "/sifs/tobias_20231231.sif" shell: """ TOBIAS BINDetect --motifs {params.motif_databases} --signals {input.bws} --genome {input.genome} --peaks {input.merged_peaks_bed} --outdir footprinting --cond_names {params.samples} --cores {threads} 1>{log} 2>&1 """ ########################################################## # cross correlation ########################################################## rule cross_correlation: input: bam="clean_bams/{replicate}_final.bam" output: pdf="quality_control/cross_correlation/{replicate}.pdf", tsv="quality_control/cross_correlation/{replicate}.tsv" log: "logs/{replicate}_cross_correlation.log" singularity: PEAKSNAKE_HOME + "/sifs/phantompeakqualtools_20231218.sif" params: spp=config['cross_correlation']['spp'] if config['cross_correlation']['spp'] else "" threads: 4 shell: """ run_spp.R -p={threads} -c={input.bam} -savp={output.pdf} -out={output.tsv} -rf {params.spp} 1>{log} 2>&1 """ rule cross_correlation_summary: input: expand("quality_control/cross_correlation/{replicate}.tsv", replicate=REPLICATES + INPUTS) output: "quality_control/cross_correlation_summary.tsv" shell: """ echo -e "Sample\\tTotalReadsFiltered\\tFragmentLength\\tCrossCorrelation\\tPhantomPeakLocation\\tPhantomPeakCorrelation\\tMinimumCrossCorrelationShift\\tMinimumCrossCorrelationValue\\tNSC\\tRSC\\tPhantomPeakQualityTag" >{output} cat {input} | sort -k 1 >>{output} """