소스 검색

add footprint analysis

zhangxudong 10 달 전
부모
커밋
cf10f909cb
2개의 변경된 파일116개의 추가작업 그리고 15개의 파일을 삭제
  1. 99 1
      Snakefile
  2. 17 14
      config.yaml

+ 99 - 1
Snakefile

@@ -59,6 +59,16 @@ with open("sample_sheet.csv", 'w') as f:
 ##########################################################
 # rule all: 最终想要生成的文件 
 ##########################################################
+def get_motifs_from_meme(meme_file):
+    motifs = []
+    with open(meme_file, 'r') as file:
+        for line in file:
+            if line.startswith("MOTIF"):
+                parts = line.split()
+                motif_name = parts[2] + "_" + parts[1]  # 第二列和第三列的组合
+                motifs.append(motif_name)
+    return motifs
+
 rule all:
     input:
 	#####################################
@@ -105,6 +115,11 @@ rule all:
         "quality_control/estimate_read_filtering.tsv",
         # cross correlation 质控表格
         "quality_control/cross_correlation_summary.tsv" if config["cross_correlation"]["do"] else [],
+        #####################################
+        # 足迹分析
+        #####################################
+        "footprinting/bindetect_results.txt",
+        expand("footprinting/{motif}/{motif}_Plot{plot}.pdf", motif=get_motifs_from_meme("motif_databases/JASPAR/JASPAR2022_CORE_plants_non-redundant_v2.meme"), plot=["Aggregate", "Heatmap"])
 
 ##########################################################
 # 使用fastp进行原始数据质控和过滤,自动判断 SE?PE?
@@ -366,7 +381,7 @@ rule callpeak_with_macs3:
     threads: 1
     params:
         control=lambda wildcards: f"-c clean_bams/{config['samples'][wildcards.replicate]}_final.bam" if config['samples'][wildcards.replicate] else "",
-        format="BAMPE" if SEQ_TYPE == 'PE' else "BAM",
+        format="BAMPE" if SEQ_TYPE == 'PE' and '--nomodel' not in config["macs3"]["extra"] else "BAM",
         gsize=config["gsize"],
         PEAK_TYPE="--broad" if PEAK_TYPE == "broad" else "",
         extra = config["macs3"]["extra"] if config["macs3"]["extra"] else ""
@@ -672,6 +687,89 @@ rule combine_fastp_reports:
         Rscript {params.peaksnake_home}/scripts/combine_fastp_report.R --input {input} --output quality_control/ 1>{log} 2>&1
         """
 
+##########################################################
+# 使用 TOBIAS 进行足迹分析
+##########################################################
+rule merge_replicate_bams:
+    input:
+        bams=lambda wildcards: expand("clean_bams/{sample}_{rep}_final.bam", sample = wildcards.sample, rep=SAMPLE_TO_REPLICATE[wildcards.sample])
+    output:
+        bam="clean_bams/{sample}_merged.bam"
+    shell:
+        """
+        samtools merge -o {output.bam} {input.bams}
+        samtools index {output.bam}
+        """
+
+rule ATACorrect:
+    input:
+        bam="clean_bams/{sample}_merged.bam",
+        genome=config["genome"],
+        merged_peaks_bed="clean_peaks/merge/merged_peaks.bed"
+    output:
+        bw="footprinting/{sample}_corrected.bw"
+    params: 
+        prefix="{sample}"
+    threads: 8
+    log: "logs/{sample}_ATACorrect.log"
+    singularity:
+        PEAKSNAKE_HOME + "/sifs/tobias_20231231.sif"
+    shell:
+        """
+        TOBIAS ATACorrect --window 10 --bam {input.bam} --genome {input.genome} --peaks {input.merged_peaks_bed} --outdir footprinting --prefix {params.prefix} --cores {threads} 1>{log} 2>&1
+        """
+
+rule FootprintScores:
+    input:
+        bw="footprinting/{sample}_corrected.bw",
+        merged_peaks_bed="clean_peaks/merge/merged_peaks.bed"
+    output:
+        bw="footprinting/{sample}_footprints.bw"
+    params:
+        prefix={sample}
+    threads: 8
+    log: "logs/{sample}_FootprintScores.log"
+    singularity:
+        PEAKSNAKE_HOME + "/sifs/tobias_20231231.sif"
+    shell:
+        """
+        TOBIAS FootprintScores --window 10 --signal {input.bw} --regions {input.merged_peaks_bed} --output {output.bw} --cores {threads} 1>{log} 2>&1
+        """
+
+rule BINDetect:
+    input:
+        bws=expand("footprinting/{sample}_footprints.bw", sample=SAMPLE_TO_REPLICATE.keys()),
+        merged_peaks_bed="clean_peaks/merge/merged_peaks.bed",
+        genome=config["genome"]
+    output:
+        "footprinting/bindetect_results.txt"
+    params:
+        samples=" ".join(SAMPLE_TO_REPLICATE.keys())
+    threads: 8
+    log: "logs/BINDetect.log"
+    singularity:
+        PEAKSNAKE_HOME + "/sifs/tobias_20231231.sif"
+    shell:
+        """
+        TOBIAS BINDetect --motifs motif_databases/JASPAR/JASPAR2022_CORE_plants_non-redundant_v2.meme --signals {input.bws} --genome {input.genome} --peaks {input.merged_peaks_bed} --outdir footprinting --cond_names {params.samples} --cores {threads} 
+        """
+
+rule plot_per_motif:
+    input:
+        motif_bed="footprinting/{motif}/beds/{motif}_all.bed",
+        bws=expand("footprinting/{sample}_corrected.bw", sample=SAMPLE_TO_REPLICATE.keys())
+    output:
+        aggregate_pdf="footprinting/{motif}/{motif}_PlotAggregate.pdf",
+        heatmap_pdf="footprinting/{motif}/{motif}_PlotHeatmap.pdf"
+    threads: 1
+    singularity:
+        PEAKSNAKE_HOME + "/sifs/tobias_20231231.sif"
+    shell:
+        """
+        TOBIAS PlotAggregate --TFBS {input.motif_bed} --signals {input.bws} --output {output.aggregate_pdf} --share_y both --plot_boundaries --signal-on-x
+        TOBIAS PlotHeatmap --TFBS {input.motif_bed} --signals {input.bws} --output {output.heatmap_pdf} --sort_by -2
+        """
+
 ##########################################################
 # cross correlation
 ##########################################################

+ 17 - 14
config.yaml

@@ -6,24 +6,27 @@ genome: ref/genome.fa
 gtf: ref/genes_longest.gtf 
 tss_tes_bed: ref/genes_longest.bed6  
 tss_tes_shuffle_bed: ref/genes_shuffle.bed6
-seq_type: SE  # 或者 "PE"
-peak_type: narrow # narrow | broad
+seq_type: PE  # 或者 "PE"
+peak_type: broad # narrow | broad
 # 有效基因组大小, 参考 https://deeptools.readthedocs.io/en/develop/content/feature/effectiveGenomeSize.html#effective-genome-size
 gsize: 119481543 
 # idr: 只支持 2 重复,适合 narrowPeak) ; intersect (2 + 重复 ) ; norep: 没有重复 
-peak_selection: idr
+peak_selection: intersect
 
 #######################
 # samples  
 #######################
 samples:
-  ZAT6_ABA_rep1: Col0_mock_rep1
-  ZAT6_ABA_rep2: Col0_mock_rep2
-  H3K4me3_WT_rep1: Input_WT_rep1
-  H3K4me3_WT_rep2: Input_WT_rep1
+  bZIP11_CHXDEX_rep1: 
+  bZIP11_CHXDEX_rep2:
+  bZIP11_CHXMOCK_rep1:
+  bZIP11_CHXMOCK_rep2:
+  WT_CHXDEX_rep1:
+  WT_CHXDEX_rep2:
 
 contrasts:
-  - ZAT6_ABA_vs_H3K4me3_WT
+  - bZIP11_CHXDEX_vs_bZIP11_CHXMOCK
+  - bZIP11_CHXDEX_vs_WT_CHXDEX
 
 #######################
 # fastp  
@@ -36,7 +39,7 @@ fastp: "--adapter_sequence AGATCGGAAGAGCACACGTCTGAACTCCAGTCA --adapter_sequence_
 # ATAC-seq 必须设置 --maxins 1000 ~ 2000
 #######################
 bowtie2_index: ref/genome  # 不设置则从genome.fa 构建
-bowtie2:
+bowtie2: --no-mixed --no-discordant --maxins 1000
 
 #######################
 # alignmentSieve 
@@ -62,7 +65,7 @@ bamCoverage: "--binSize 50 --normalizeUsing RPKM"
 #######################
 macs3: 
   qscore: 3
-  extra: 
+  extra: --nomodel --shift -100 --extsize 200 
 
 #######################
 # replicate intersect 
@@ -91,9 +94,9 @@ motif:
 uropa:
   do: yes
   feature: ['gene']
-  feature_anchor: ['start'] # ['start', 'center', 'end']
-  relative_location: ['Upstream', 'OverlapStart'] # ['Upstream', 'Downstream', 'OverlapStart', 'OverlapEnd','PeakInsideFeature', 'FeatureInsidePeak']
-  distance: [5000, 5000]
+  feature_anchor: ['start', 'center', 'end']
+  relative_location:  ['Upstream', 'Downstream', 'OverlapStart', 'OverlapEnd','PeakInsideFeature', 'FeatureInsidePeak']
+  distance: [100000, 100000]
 
 #######################
 # diff analysis
@@ -108,5 +111,5 @@ diff_peaks:
 # ChIP-seq 需要进行, ATAC-seq 不需要 
 #######################
 cross_correlation:
-  do: yes
+  do: no
   spp: "-s=0:5:500"