Bladeren bron

add footprint analysis

zhangxudong 10 maanden geleden
bovenliggende
commit
f957766762
2 gewijzigde bestanden met toevoegingen van 91 en 93 verwijderingen
  1. 70 75
      Snakefile
  2. 21 18
      config.yaml

+ 70 - 75
Snakefile

@@ -1,5 +1,6 @@
 import os
 import yaml
+import pandas as pd
 
 configfile: "config.yaml"
 ##########################################################
@@ -7,6 +8,7 @@ configfile: "config.yaml"
 ##########################################################
 ## 软件主目录
 PEAKSNAKE_HOME = config["PEAKSNAKE_HOME"] if config["PEAKSNAKE_HOME"] else os.getcwd()
+## 关键参数
 PEAK_TYPE = config["peak_type"]
 SEQ_TYPE = config["seq_type"]
 PEAK_SELECTION = config["peak_selection"]
@@ -19,6 +21,7 @@ REPLICATE_TO_INPUT = {k: config['samples'][k] for k in sorted(config['samples'])
 
 # 列表:所有 REPLICATES 
 REPLICATES = sorted(list(set(REPLICATE_TO_INPUT.keys())))
+
 # 列表:所有 INPUTS 
 INPUTS = sorted(list(set(REPLICATE_TO_INPUT.values())))
 INPUTS = [] if INPUTS == [None] else INPUTS
@@ -65,7 +68,12 @@ def get_motifs_from_meme(meme_file):
         for line in file:
             if line.startswith("MOTIF"):
                 parts = line.split()
-                motif_name = parts[2] + "_" + parts[1]  # 第二列和第三列的组合
+                # 检查是否有三列,且第三列不是空字符串
+                if len(parts) == 3 and parts[2]:
+                    motif_name = parts[2] + "_" + parts[1]  # 第三列和第二列的组合
+                # 检查是否有两列或第三列为空
+                elif len(parts) >= 2:
+                    motif_name = "_" + parts[1]  # 只使用第二列
                 motifs.append(motif_name)
     return motifs
 
@@ -74,8 +82,14 @@ rule all:
 	#####################################
 	# 从 fastq 到 peaks 
 	#####################################
+        # 测序数据质控表格
+        "quality_control/fastp_summary.tsv",
         # 最终比对结果
         expand("clean_bams/{replicate}_final.bam", replicate = REPLICATES + INPUTS),
+        # 比对质控表格
+        "quality_control/estimate_read_filtering.tsv",
+        # cross correlation 质控表格
+        "quality_control/cross_correlation_summary.tsv" if config["cross_correlation"]["do"] else [],
         # bw 文件, deeptools 可视化
         expand("clean_bams/{replicate}.bw", replicate = REPLICATES + INPUTS),
         "deeptools/sample_correlation.pdf" if len(REPLICATES) > 2 else [],
@@ -92,34 +106,27 @@ rule all:
         # 提取序列
         expand("clean_peaks/{m}/{sample}_peaks.fa", sample=SAMPLE_TO_REPLICATE.keys(), m = PEAK_SELECTION),
         # 所有样本共识peaks
-        "clean_peaks/merge/merged_peaks.bed" if len(SAMPLE_TO_REPLICATE) > 2 else [],
-        #####################################
-        # Peak 注释
-        #####################################
-        expand("peak_annotation/{sample}_peaks_allhits.txt", sample = SAMPLE_TO_REPLICATE.keys()) if config["uropa"]["do"] else [],
-        expand("peak_annotation/{sample}_peaks_finalhits.txt", sample = SAMPLE_TO_REPLICATE.keys()) if config["uropa"]["do"] else [],
+        "clean_peaks/merge/merged_peaks.bed",
         #####################################
         # 定量分析
         #####################################
         "counts/merged_peaks.counts.matrix",
         "counts/merged_peaks.TMM.CPM.matrix",
-	# 差异分析
+        #####################################
+        # Peak 注释
+        #####################################
+        # 对单个样本 peaks 注释
+        expand("peak_annotation/{sample}_peaks_{hit}.txt", sample = SAMPLE_TO_REPLICATE.keys(), hit = ["allhits", "finalhits"]) if config["uropa"]["do"] else [],
+        # 对 merge 之后的 peaks 注释
+        expand("peak_annotation/merged_peaks_{hit}.txt", hit = ["allhits", "finalhits"]) if config["uropa"]["do"] else [],
+        #####################################
+        # 差异分析
+        #####################################
 	expand("diff_peaks/{contrast}.{m}.DE_results", contrast=config["contrasts"], m=config["diff_peaks"]["method"]) if config["diff_peaks"]["do"] and config["contrasts"] else [],
-        expand("diff_peaks_annotation/{contrast}.bed6", contrast=config["contrasts"]) if config["diff_peaks"]["do"] and config["contrasts"] and config["uropa"]["do"] else [],
-	#####################################
-	# 质控结果 
-	#####################################
-        # 测序数据质控表格
-        "quality_control/fastp_summary.tsv",
-        # 比对质控表格
-        "quality_control/estimate_read_filtering.tsv",
-        # cross correlation 质控表格
-        "quality_control/cross_correlation_summary.tsv" if config["cross_correlation"]["do"] else [],
         #####################################
         # 足迹分析
         #####################################
-        "footprinting/bindetect_results.txt",
-        expand("footprinting/{motif}/{motif}_Plot{plot}.pdf", motif=get_motifs_from_meme("motif_databases/JASPAR/JASPAR2022_CORE_plants_non-redundant_v2.meme"), plot=["Aggregate", "Heatmap"])
+        "footprinting/bindetect_results.txt" if config["footprint"]["do"] else []
 
 ##########################################################
 # 使用fastp进行原始数据质控和过滤,自动判断 SE?PE?
@@ -243,7 +250,7 @@ rule estimate_read_filtering:
 	--ignoreDuplicates \
 	--minMappingQuality {params.minMappingQuality} \
 	--samFlagExclude 260 \
-	--blackListFileName ref/blacklist.bed \
+	{params.blacklist} \
         {params.extra} 1>{log} 2>&1
         """
 
@@ -485,14 +492,14 @@ rule select_peaks_by_idr:
 ##########################################################
 rule extract_peak_sequence:
     input:
-        Peak="clean_peaks/" + PEAK_SELECTION + "/{sample}_peaks." + PEAK_TYPE + "Peak"
+        Peak="clean_peaks/{m}/{sample}_peaks." + PEAK_TYPE + "Peak"
     output:
-        peak_fa="clean_peaks/" + PEAK_SELECTION + "/{sample}_peaks.fa"
+        peak_fa="clean_peaks/{m}/{sample}_peaks.fa"
     params:
         top_n=config["motif"]["top_n"],
         summit_flank=config["motif"]["summit_flank"],
         genome=config["genome"],
-        summit_fa="clean_peaks/" + PEAK_SELECTION + "/{sample}_summit.fa"
+        summit_fa="clean_peaks/{m}/{sample}_summit.fa"
     shell:
         """
         set +o pipefail;
@@ -548,7 +555,27 @@ rule merge_peaks:
         cat {input.sample_peaks} > clean_peaks/merge/cat_peaks.bed
         bedtools sort -i clean_peaks/merge/cat_peaks.bed -g {params.fai} > clean_peaks/merge/cat_sorted_peaks.bed
         bedtools merge -i clean_peaks/merge/cat_sorted_peaks.bed > {output.merged_peaks_bed}
-        awk 'OFS="\t" {{print $1":"$2+1"-"$3, $1, $2+1, $3, "."}}' {output.merged_peaks_bed} > {output.merged_peaks_saf}
+        awk 'OFS="\t" {{print $1":"$2"-"$3, $1, $2+1, $3, "."}}' {output.merged_peaks_bed} > {output.merged_peaks_saf}
+        """
+
+rule merged_peak_annotation_with_uropa:
+    input:
+        Peak="clean_peaks/merge/merged_peaks.bed",
+        gtf=config["gtf"]
+    output:
+        allhits="peak_annotation/merged_peaks_allhits.txt",
+        finalhits="peak_annotation/merged_peaks_finalhits.txt"
+    log: "logs/merged_peak_annotation_with_uropa.log"
+    params:
+        feature=config["uropa"]["feature"],
+        feature_anchor=config["uropa"]["feature_anchor"],
+        distance=config["uropa"]["distance"],
+        relative_location=config["uropa"]["relative_location"]
+    singularity:
+        PEAKSNAKE_HOME + "/sifs/uropa_4.0.3--pyhdfd78af_0.sif"
+    shell:
+        """
+        uropa --bed {input.Peak} --gtf {input.gtf} --feature {params.feature} --feature-anchor {params.feature_anchor} --distance {params.distance} --relative-location {params.relative_location} --show-attributes gene_id --outdir peak_annotation 1>{log} 2>&1
         """
 
 ##########################################################
@@ -610,17 +637,19 @@ rule run_deseq2:
     output:
         expand("diff_peaks/{contrast}.DESeq2.DE_results", contrast=config["contrasts"])    
     log: "logs/run_deseq2.log"
+    params:
+        extra=config["diff_peaks"]["extra"]
     singularity:
         PEAKSNAKE_HOME + "/sifs/trinity_20231218.sif"
     shell:
         """
-        run_DE_analysis.pl -m {input} --method DESeq2 -s samples.txt --contrasts contrasts.txt -o diff_peaks 1>{log} 2>&1
+        run_DE_analysis.pl -m {input} --method DESeq2 -s samples.txt --contrasts contrasts.txt -o diff_peaks {params.extra} 1>{log} 2>&1
 
         cd diff_peaks/
         rm *count_matrix *.MA_n_Volcano.pdf *.Rscript
         for file in merged_peaks.counts.matrix.*.DE_results; do
-            echo -e "PeakID\tsampleA\tsampleB\tFoldChange\tPValue\tPadj" >"${{file#merged_peaks.counts.matrix.}}"
-            grep -v "^sampleA" $file | awk 'BEGIN {{OFS="\t"}} {{ $7 = exp(log(2) * $7); print $1,$2,$3,$7,$10,$11}}' >> "${{file#merged_peaks.counts.matrix.}}"
+            echo -e "PeakID\tsampleA\tsampleB\tlog2FoldChange\tPValue\tPadj" >"${{file#merged_peaks.counts.matrix.}}"
+            grep -v "^sampleA" $file | awk 'BEGIN {{OFS="\t"}} {{print $1,$2,$3,$7,$10,$11}}' >> "${{file#merged_peaks.counts.matrix.}}"
             rm $file
         done
         cd ..
@@ -632,45 +661,24 @@ rule run_edgeR:
     output:
         expand("diff_peaks/{contrast}.edgeR.DE_results", contrast=config["contrasts"])
     log: "logs/run_edgeR.log"
+    params: 
+        extra=config["diff_peaks"]["extra"]
     singularity:
         PEAKSNAKE_HOME + "/sifs/trinity_20231218.sif"
     shell:
         """
-        run_DE_analysis.pl -m {input} --method edgeR -s samples.txt --contrasts contrasts.txt -o diff_peaks 1>{log} 2>&1
+        run_DE_analysis.pl -m {input} --method edgeR -s samples.txt --contrasts contrasts.txt -o diff_peaks {params.extra} 1>{log} 2>&1
 
         cd diff_peaks/
         rm *count_matrix *.MA_n_Volcano.pdf *.Rscript
         for file in merged_peaks.counts.matrix.*.DE_results; do
-            echo -e "PeakID\tsampleA\tsampleB\tFoldChange\tPValue\tPadj" >"${{file#merged_peaks.counts.matrix.}}"
-            grep -v "^sampleA" $file | awk 'BEGIN {{OFS="\t"}} {{ $4 = exp(log(2) * $4); print $1,$2,$3,$4,$6,$7}}' >> "${{file#merged_peaks.counts.matrix.}}"
+            echo -e "PeakID\tsampleA\tsampleB\tlog2FoldChange\tPValue\tPadj" >"${{file#merged_peaks.counts.matrix.}}"
+            grep -v "^sampleA" $file | awk 'BEGIN {{OFS="\t"}} {{print $1,$2,$3,$4,$6,$7}}' >> "${{file#merged_peaks.counts.matrix.}}"
             rm $file 
         done
         cd .. 
         """
 
-rule diff_peaks_annotation_with_uropa:
-    input:
-        diff_peaks="diff_peaks/{contrast}.edgeR.DE_results",
-        gtf=config["gtf"]
-    output:
-        diff_peaks_bed="diff_peaks_annotation/{contrast}.bed6",
-        allhits="diff_peaks_annotation/{contrast}_allhits.txt",
-        finalhits="diff_peaks_annotation/{contrast}_finalhits.txt"
-    log: "logs/{contrast}_diff_peaks_annotation_with_uropa.log"
-    params:
-        diff_peaks_padj=config["diff_peaks"]["padj"],
-        feature_anchor=config["uropa"]["feature_anchor"],
-        distance=config["uropa"]["distance"],
-        relative_location=config["uropa"]["relative_location"]
-    singularity:
-        PEAKSNAKE_HOME + "/sifs/uropa_4.0.3--pyhdfd78af_0.sif"
-    shell:
-        """
-        awk 'BEGIN {{FS="\\t"; OFS="\t"}} NR>1 && $6 < {params.diff_peaks_padj} {{split($1, arr, "[:-]"); print arr[1], arr[2]-1, arr[3], $1, $4, "."}}' {input.diff_peaks} > {output.diff_peaks_bed}
-
-        uropa --bed {output.diff_peaks_bed} --gtf {input.gtf} --feature gene --feature-anchor {params.feature_anchor} --distance {params.distance} --relative-location {params.relative_location} --outdir diff_peaks_annotation 1>{log} 2>&1
-        """
-
 ##########################################################
 # 合并fastp质控结果生成表格
 ##########################################################
@@ -716,7 +724,7 @@ rule ATACorrect:
         PEAKSNAKE_HOME + "/sifs/tobias_20231231.sif"
     shell:
         """
-        TOBIAS ATACorrect --window 10 --bam {input.bam} --genome {input.genome} --peaks {input.merged_peaks_bed} --outdir footprinting --prefix {params.prefix} --cores {threads} 1>{log} 2>&1
+        TOBIAS ATACorrect --bam {input.bam} --genome {input.genome} --peaks {input.merged_peaks_bed} --outdir footprinting --prefix {params.prefix} --cores {threads} 1>{log} 2>&1
         """
 
 rule FootprintScores:
@@ -733,7 +741,7 @@ rule FootprintScores:
         PEAKSNAKE_HOME + "/sifs/tobias_20231231.sif"
     shell:
         """
-        TOBIAS FootprintScores --window 10 --signal {input.bw} --regions {input.merged_peaks_bed} --output {output.bw} --cores {threads} 1>{log} 2>&1
+        TOBIAS ScoreBigwig --signal {input.bw} --regions {input.merged_peaks_bed} --output {output.bw} --cores {threads} 1>{log} 2>&1
         """
 
 rule BINDetect:
@@ -742,32 +750,19 @@ rule BINDetect:
         merged_peaks_bed="clean_peaks/merge/merged_peaks.bed",
         genome=config["genome"]
     output:
-        "footprinting/bindetect_results.txt"
+        "footprinting/bindetect_results.txt",
+        expand("footprinting/{motif}/beds/{motif}_all.bed", motif=get_motifs_from_meme(config["footprint"]["motif"])),
+        expand("footprinting/{motif}/beds/{motif}_{sample}_{b}.bed", motif=get_motifs_from_meme(config["footprint"]["motif"]), sample=SAMPLE_TO_REPLICATE.keys(), b=["bound", "unbound"])
     params:
-        samples=" ".join(SAMPLE_TO_REPLICATE.keys())
+        samples=" ".join(SAMPLE_TO_REPLICATE.keys()),
+        motif_databases=config["footprint"]["motif"]
     threads: 8
     log: "logs/BINDetect.log"
     singularity:
         PEAKSNAKE_HOME + "/sifs/tobias_20231231.sif"
     shell:
         """
-        TOBIAS BINDetect --motifs motif_databases/JASPAR/JASPAR2022_CORE_plants_non-redundant_v2.meme --signals {input.bws} --genome {input.genome} --peaks {input.merged_peaks_bed} --outdir footprinting --cond_names {params.samples} --cores {threads} 
-        """
-
-rule plot_per_motif:
-    input:
-        motif_bed="footprinting/{motif}/beds/{motif}_all.bed",
-        bws=expand("footprinting/{sample}_corrected.bw", sample=SAMPLE_TO_REPLICATE.keys())
-    output:
-        aggregate_pdf="footprinting/{motif}/{motif}_PlotAggregate.pdf",
-        heatmap_pdf="footprinting/{motif}/{motif}_PlotHeatmap.pdf"
-    threads: 1
-    singularity:
-        PEAKSNAKE_HOME + "/sifs/tobias_20231231.sif"
-    shell:
-        """
-        TOBIAS PlotAggregate --TFBS {input.motif_bed} --signals {input.bws} --output {output.aggregate_pdf} --share_y both --plot_boundaries --signal-on-x
-        TOBIAS PlotHeatmap --TFBS {input.motif_bed} --signals {input.bws} --output {output.heatmap_pdf} --sort_by -2
+        TOBIAS BINDetect --motifs {params.motif_databases} --signals {input.bws} --genome {input.genome} --peaks {input.merged_peaks_bed} --outdir footprinting --cond_names {params.samples} --cores {threads} 1>{log} 2>&1 
         """
 
 ##########################################################

+ 21 - 18
config.yaml

@@ -2,31 +2,26 @@
 # globe  
 #######################
 PEAKSNAKE_HOME: /home/zhxd/workspace/20231225_ChIP_ATAC/PeakSnake
-genome: ref/genome.fa
+genome: ref/hg38.fa
 gtf: ref/genes_longest.gtf 
 tss_tes_bed: ref/genes_longest.bed6  
 tss_tes_shuffle_bed: ref/genes_shuffle.bed6
 seq_type: PE  # 或者 "PE"
 peak_type: broad # narrow | broad
 # 有效基因组大小, 参考 https://deeptools.readthedocs.io/en/develop/content/feature/effectiveGenomeSize.html#effective-genome-size
-gsize: 119481543 
+gsize: 2913022398
 # idr: 只支持 2 重复,适合 narrowPeak) ; intersect (2 + 重复 ) ; norep: 没有重复 
-peak_selection: intersect
+peak_selection: norep
 
 #######################
 # samples  
 #######################
 samples:
-  bZIP11_CHXDEX_rep1: 
-  bZIP11_CHXDEX_rep2:
-  bZIP11_CHXMOCK_rep1:
-  bZIP11_CHXMOCK_rep2:
-  WT_CHXDEX_rep1:
-  WT_CHXDEX_rep2:
+  Diabetic_hs_rep1:
+  Healthy_hs_rep1:
 
 contrasts:
-  - bZIP11_CHXDEX_vs_bZIP11_CHXMOCK
-  - bZIP11_CHXDEX_vs_WT_CHXDEX
+  - Diabetic_hs_vs_Healthy_hs
 
 #######################
 # fastp  
@@ -44,17 +39,16 @@ bowtie2: --no-mixed --no-discordant --maxins 1000
 #######################
 # alignmentSieve 
 # blackregion: 黑名单区域 线粒体 叶绿体 大肠杆菌等
-# ATAC-seq 在 extra 中添加 --ATACshift 
 #######################
 alignmentSieve:
   minMappingQuality: 25
-  blacklist: ref/blacklist.bed
+  blacklist: ref/hg38.blacklist.bed
   extra:
 
 #######################
 # bam to bw
 #######################
-bamCoverage: "--binSize 50 --normalizeUsing RPKM"
+bamCoverage: "--binSize 10 --normalizeUsing RPKM"
 
 #######################
 # MACS3
@@ -62,10 +56,11 @@ bamCoverage: "--binSize 50 --normalizeUsing RPKM"
 # qscore: 用于过滤 peaks
 # macs3 callpeak ChIP-seq 一般不需要设置特殊参数
 # ATAC-seq 推荐: --nomodel --shift -100 --extsize 200 或 --nomodel --shift -75 --extsize 150
+# --SPMR -B 用于生成单碱基的 bdg 文件, 可以转换为 bw 文件,有可视化的需求可以加
 #######################
 macs3: 
   qscore: 3
-  extra: --nomodel --shift -100 --extsize 200 
+  extra: --nomodel --shift -100 --extsize 200 --SPMR -B 
 
 #######################
 # replicate intersect 
@@ -85,7 +80,7 @@ idr:
 motif:
   top_n: 1000
   summit_flank: 50
-
+  databases: motif_databases/HUMAN/HOCOMOCOv11_core_HUMAN_mono_meme_format.meme
 #######################
 # Peank 注释
 # 只关注 TSS 附近 用 ChIPseeker
@@ -94,9 +89,16 @@ motif:
 uropa:
   do: yes
   feature: ['gene']
-  feature_anchor: ['start', 'center', 'end']
+  feature_anchor: ['start']
   relative_location:  ['Upstream', 'Downstream', 'OverlapStart', 'OverlapEnd','PeakInsideFeature', 'FeatureInsidePeak']
-  distance: [100000, 100000]
+  distance: [2000, 500]
+
+#######################
+# footprinting
+#######################
+footprint:
+  do: yes
+  motif: motif_databases/HUMAN/HOCOMOCOv11_core_HUMAN_mono_meme_format.meme
 
 #######################
 # diff analysis
@@ -105,6 +107,7 @@ diff_peaks:
   do: yes
   method: edgeR # edgeR | DESeq2
   padj: 0.05
+  extra: --dispersion 0.1
 
 #######################
 # cross correlation