# Snakefile
import os
import yaml

configfile: "config.yaml"

##########################################################
# Global variables and sample information
##########################################################
## Pipeline home directory
PEAKSNAKE_HOME = config["PEAKSNAKE_HOME"] if config["PEAKSNAKE_HOME"] else os.getcwd()
PEAK_TYPE = config["peak_type"]
SEQ_TYPE = config["seq_type"]
PEAK_SELECTION = config["peak_selection"]
GENOME = config["genome"]
GTF = config["gtf"]

## Sample information variables
# Dict: REPLICATE -> INPUT (control)
REPLICATE_TO_INPUT = {k: config['samples'][k] for k in sorted(config['samples'])}
# List: all REPLICATES
REPLICATES = sorted(list(set(REPLICATE_TO_INPUT.keys())))
# List: all INPUTS
INPUTS = sorted(list(set(REPLICATE_TO_INPUT.values())))
INPUTS = [] if INPUTS == [None] else INPUTS
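# For orientation, the `samples` section of config.yaml maps each IP replicate
# to its input/control (names below are hypothetical; a null value means the
# replicate has no control):
#
# samples:
#   H3K4me3_WT_rep1: Input_WT_rep1
#   H3K4me3_WT_rep2: Input_WT_rep2
#   H3K4me3_mut_rep1: null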
# Dict: SAMPLE -> its replicate suffixes (rep1, rep2, ...)
SAMPLE_TO_REPLICATE = {}
for s in REPLICATES:
    name, rep = '_'.join(s.split('_')[:-1]), s.split('_')[-1]
    SAMPLE_TO_REPLICATE.setdefault(name, []).append(rep)
SAMPLE_TO_REPLICATE = {k: sorted(v) for k, v in SAMPLE_TO_REPLICATE.items()}
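# A sketch of the naming convention this loop assumes: replicate IDs end in
# "_rep<N>", so the hypothetical REPLICATES ["H3K4me3_WT_rep1", "H3K4me3_WT_rep2"]
# collapse to SAMPLE_TO_REPLICATE == {"H3K4me3_WT": ["rep1", "rep2"]}.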
## Generate the sample sheet
with open("sample_sheet.csv", 'w') as f:
    f.write("SampleID,ControlID,Tissue,Factor,Condition,Treatment,Replicate,bamReads,Peaks,bamControl,PeakCaller\n")
    for sample, control in REPLICATE_TO_INPUT.items():
        sample_parts = sample.split('_')
        factor = sample_parts[0]  # extract Factor from the sample ID
        tissue = "NA"  # Tissue is not encoded in the sample ID
        treatment = sample_parts[1]  # extract Treatment from the sample ID
        condition = factor + "_" + treatment
        replicate = sample_parts[2].replace("rep", "")  # turn "rep1"/"rep2" into "1"/"2"
        if control:
            control_parts = control.split('_')
            control_id = "_".join(control_parts[:2])  # build the ControlID
            bamControl = f"clean_bams/{control}_final.bam"
        else:
            control_id = "NA"
            bamControl = "NA"
        bamReads = f"clean_bams/{sample}_final.bam"
        peaks = f"clean_peaks/cutoff/{sample}_peaks.{PEAK_TYPE}Peak"
        f.write(f"{sample},{control_id},{tissue},{factor},{condition},{treatment},{replicate},{bamReads},{peaks},{bamControl},bed\n")
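# With the hypothetical pair above (IP "H3K4me3_WT_rep1", control
# "Input_WT_rep1") and narrow peaks, the loop would emit this row:
# H3K4me3_WT_rep1,Input_WT,NA,H3K4me3,H3K4me3_WT,WT,1,clean_bams/H3K4me3_WT_rep1_final.bam,clean_peaks/cutoff/H3K4me3_WT_rep1_peaks.narrowPeak,clean_bams/Input_WT_rep1_final.bam,bed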
##########################################################
# rule all: the final files to generate
##########################################################
def get_motifs_from_meme(meme_file):
    motifs = []
    with open(meme_file, 'r') as file:
        for line in file:
            if line.startswith("MOTIF"):
                parts = line.split()
                motif_name = parts[2] + "_" + parts[1]  # combine the third and second fields
                motifs.append(motif_name)
    return motifs
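# MEME motif headers look like "MOTIF <ID> <name>", so the (hypothetical)
# line "MOTIF MA0005.2 AG" would be recorded here as "AG_MA0005.2", which is
# intended to match the per-motif output directories created by TOBIAS BINDetect.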
rule all:
    input:
        #####################################
        # From fastq to peaks
        #####################################
        # Final alignments
        expand("clean_bams/{replicate}_final.bam", replicate=REPLICATES + INPUTS),
        # bigWig files for deepTools visualization
        expand("clean_bams/{replicate}.bw", replicate=REPLICATES + INPUTS),
        "deeptools/sample_correlation.pdf" if len(REPLICATES) > 2 else [],
        "deeptools/tss_heatmap.pdf",
        "deeptools/tss_tes_heatmap.pdf",
        # Raw peaks
        expand("raw_peaks/{replicate}_peaks.{PEAK_TYPE}Peak", replicate=REPLICATES, PEAK_TYPE=PEAK_TYPE),
        # Cutoff analysis
        expand("raw_peaks/{replicate}_cutoff_analysis.pdf", replicate=REPLICATES),
        # Clean peaks
        expand("clean_peaks/cutoff/{replicate}_peaks.{PEAK_TYPE}Peak", replicate=REPLICATES, PEAK_TYPE=PEAK_TYPE),
        # Peak selection via intersect or IDR
        expand("clean_peaks/{m}/{sample}_peaks.{PEAK_TYPE}Peak", sample=SAMPLE_TO_REPLICATE.keys(), m=PEAK_SELECTION, PEAK_TYPE=PEAK_TYPE),
        # Extracted peak sequences
        expand("clean_peaks/{m}/{sample}_peaks.fa", sample=SAMPLE_TO_REPLICATE.keys(), m=PEAK_SELECTION),
        # Consensus peaks across all samples
        "clean_peaks/merge/merged_peaks.bed" if len(SAMPLE_TO_REPLICATE) > 2 else [],
        #####################################
        # Peak annotation
        #####################################
        expand("peak_annotation/{sample}_peaks_allhits.txt", sample=SAMPLE_TO_REPLICATE.keys()) if config["uropa"]["do"] else [],
        expand("peak_annotation/{sample}_peaks_finalhits.txt", sample=SAMPLE_TO_REPLICATE.keys()) if config["uropa"]["do"] else [],
        #####################################
        # Quantification
        #####################################
        "counts/merged_peaks.counts.matrix",
        "counts/merged_peaks.TMM.CPM.matrix",
        # Differential analysis
        expand("diff_peaks/{contrast}.{m}.DE_results", contrast=config["contrasts"], m=config["diff_peaks"]["method"]) if config["diff_peaks"]["do"] and config["contrasts"] else [],
        expand("diff_peaks_annotation/{contrast}.bed6", contrast=config["contrasts"]) if config["diff_peaks"]["do"] and config["contrasts"] and config["uropa"]["do"] else [],
        #####################################
        # Quality control
        #####################################
        # Sequencing QC table
        "quality_control/fastp_summary.tsv",
        # Alignment filtering QC table
        "quality_control/estimate_read_filtering.tsv",
        # Cross-correlation QC table
        "quality_control/cross_correlation_summary.tsv" if config["cross_correlation"]["do"] else [],
        #####################################
        # Footprinting
        #####################################
        "footprinting/bindetect_results.txt",
        expand("footprinting/{motif}/{motif}_Plot{plot}.pdf", motif=get_motifs_from_meme("motif_databases/JASPAR/JASPAR2022_CORE_plants_non-redundant_v2.meme"), plot=["Aggregate", "Heatmap"])
##########################################################
# QC and filter raw reads with fastp (SE or PE handled automatically)
# raw_data => clean_data
##########################################################
rule fastp_quality_control:
    input:
        fastq=["raw_data/{replicate}_R1.fastq.gz", "raw_data/{replicate}_R2.fastq.gz"] if SEQ_TYPE == 'PE' else ["raw_data/{replicate}_R1.fastq.gz"]
    output:
        fastq=["clean_data/{replicate}_R1.fastq.gz", "clean_data/{replicate}_R2.fastq.gz"] if SEQ_TYPE == 'PE' else ["clean_data/{replicate}_R1.fastq.gz"],
        html_report="clean_data/{replicate}_fastp.html",
        json_report="clean_data/{replicate}_fastp.json"
    log: "logs/{replicate}_fastp_quality_control.log"
    threads: 3
    singularity:
        PEAKSNAKE_HOME + "/sifs/commonTools_20231218.sif"
    params:
        fastq="-i raw_data/{replicate}_R1.fastq.gz -o clean_data/{replicate}_R1.fastq.gz -I raw_data/{replicate}_R2.fastq.gz -O clean_data/{replicate}_R2.fastq.gz" if SEQ_TYPE == 'PE' else "-i raw_data/{replicate}_R1.fastq.gz -o clean_data/{replicate}_R1.fastq.gz",
        fastp=config["fastp"]
    shell:
        """
        fastp {params.fastq} --html {output.html_report} --json {output.json_report} --thread {threads} {params.fastp} 1>{log} 2>&1
        """
##########################################################
# Build the bowtie2 index from the genome
##########################################################
rule bowtie2_build:
    input:
        genome=GENOME
    output:
        index_prefix="ref/genome.1.bt2"
    log: "logs/bowtie2_build.log"
    singularity:
        PEAKSNAKE_HOME + "/sifs/bowtie2_2.5.2.sif"
    threads: 6
    shell:
        "bowtie2-build --threads {threads} {input.genome} ref/genome 1>{log} 2>&1"
##########################################################
# Align reads to the reference genome with bowtie2
# fastq => sam
##########################################################
rule bowtie2_align:
    input:
        fastq=["clean_data/{replicate}_R1.fastq.gz", "clean_data/{replicate}_R2.fastq.gz"] if SEQ_TYPE == 'PE' else ["clean_data/{replicate}_R1.fastq.gz"],
        genome_index=config["bowtie2_index"] + ".1.bt2" if config["bowtie2_index"] else "ref/genome.1.bt2"
    output:
        sam="raw_bams/{replicate}.sam"
    log: "logs/{replicate}_bowtie2_align.log"
    singularity:
        PEAKSNAKE_HOME + "/sifs/bowtie2_2.5.2.sif"
    threads: 4
    params:
        genome_index=config["bowtie2_index"] if config["bowtie2_index"] else "ref/genome",
        fastq="-1 clean_data/{replicate}_R1.fastq.gz -2 clean_data/{replicate}_R2.fastq.gz" if SEQ_TYPE == 'PE' else "-U clean_data/{replicate}_R1.fastq.gz",
        bowtie2=config["bowtie2"] if config["bowtie2"] else ""
    shell:
        """
        bowtie2 -p {threads} -x {params.genome_index} {params.fastq} -S {output.sam} {params.bowtie2} 1>{log} 2>&1
        """
rule sort_bam:
    input:
        sam="raw_bams/{replicate}.sam"
    output:
        sorted_bam="raw_bams/{replicate}_sorted.bam"
    log: "logs/{replicate}_sort_bam.log"
    singularity:
        PEAKSNAKE_HOME + "/sifs/commonTools_20231218.sif"
    threads: 4
    shell:
        """
        samtools sort -@ {threads} -o {output.sorted_bam} {input.sam} 1>{log} 2>&1
        samtools index {output.sorted_bam}
        """
rule sieve_alignment:
    input:
        bam="raw_bams/{replicate}_sorted.bam"
    output:
        bam="clean_bams/{replicate}_final.bam"
    log: "logs/{replicate}_sieve_alignment.log"
    threads: 4
    singularity:
        PEAKSNAKE_HOME + "/sifs/deeptools_20231220.sif"
    params:
        peaksnake_home=PEAKSNAKE_HOME,
        minMappingQuality=config["alignmentSieve"]["minMappingQuality"],
        blacklist="--blackListFileName " + config["alignmentSieve"]["blacklist"] if config["alignmentSieve"]["blacklist"] else "",
        extra=config["alignmentSieve"]["extra"] if config["alignmentSieve"]["extra"] else ""
    shell:
        """
        alignmentSieve --numberOfProcessors {threads} --bam {input.bam} --outFile {output.bam} --filterMetrics {log} --ignoreDuplicates --minMappingQuality {params.minMappingQuality} --samFlagExclude 260 {params.blacklist} {params.extra} 1>{log} 2>&1
        samtools index {output.bam}
        """
rule estimate_read_filtering:
    input:
        bams=expand("raw_bams/{replicate}_sorted.bam", replicate=REPLICATES + INPUTS)
    output:
        stat="quality_control/estimate_read_filtering.tsv"
    threads: 4
    singularity:
        PEAKSNAKE_HOME + "/sifs/deeptools_20231220.sif"
    log: "logs/estimate_read_filtering.log"
    params:
        peaksnake_home=PEAKSNAKE_HOME,
        minMappingQuality=config["alignmentSieve"]["minMappingQuality"],
        blacklist="--blackListFileName " + config["alignmentSieve"]["blacklist"] if config["alignmentSieve"]["blacklist"] else "",
        extra=config["alignmentSieve"]["extra"] if config["alignmentSieve"]["extra"] else "",
        sampleLabels=REPLICATES + INPUTS
    shell:
        """
        estimateReadFiltering \
            --numberOfProcessors {threads} \
            --bam {input.bams} \
            --sampleLabels {params.sampleLabels} \
            --outFile {output.stat} \
            --ignoreDuplicates \
            --minMappingQuality {params.minMappingQuality} \
            --samFlagExclude 260 \
            {params.blacklist} \
            {params.extra} 1>{log} 2>&1
        """
##########################################################
# deepTools plots
##########################################################
rule convert_bam_to_bigwig:
    input:
        bam="clean_bams/{replicate}_final.bam"
    output:
        bw="clean_bams/{replicate}.bw"
    log: "logs/{replicate}_convert_bam_to_bigwig.log"
    singularity:
        PEAKSNAKE_HOME + "/sifs/deeptools_20231220.sif"
    threads: 2
    params:
        gsize=config["gsize"],
        bamCoverage=config["bamCoverage"] if config["bamCoverage"] else ""
    shell:
        """
        bamCoverage -p {threads} --bam {input.bam} -o {output.bw} --effectiveGenomeSize {params.gsize} {params.bamCoverage} 1>{log} 2>&1
        """
rule summarize_multiple_bigwigs:
    input:
        bws=expand("clean_bams/{replicate}.bw", replicate=REPLICATES + INPUTS),
        tss_tes_bed=config["tss_tes_bed"]
    output:
        "clean_bams/bw_summary.gz"
    params:
        labels=REPLICATES + INPUTS
    log: "logs/summarize_multiple_bigwigs.log"
    singularity:
        PEAKSNAKE_HOME + "/sifs/deeptools_20231220.sif"
    threads: 10
    shell:
        """
        multiBigwigSummary BED-file \
            --bwfiles {input.bws} \
            --labels {params.labels} \
            --BED {input.tss_tes_bed} \
            -o {output} -p {threads} 1>{log} 2>&1
        """
rule generate_correlation_plots:
    input:
        "clean_bams/bw_summary.gz"
    output:
        pdf="deeptools/sample_correlation.pdf",
        tab="deeptools/sample_correlation.tab"
    log: "logs/generate_correlation_plots.log"
    singularity:
        PEAKSNAKE_HOME + "/sifs/deeptools_20231220.sif"
    shell:
        """
        plotCorrelation -in {input} -o {output.pdf} \
            --corMethod pearson --whatToPlot heatmap \
            -min 0.5 \
            --plotTitle "Pearson Correlation of Samples" \
            --outFileCorMatrix {output.tab} 1>{log} 2>&1
        """
rule plot_heatmap_reference_point:
    input:
        bws=expand("clean_bams/{replicate}.bw", replicate=REPLICATES + INPUTS),
        tss_tes_bed=config["tss_tes_bed"],
        tss_tes_shuffle_bed=config["tss_tes_shuffle_bed"]
    output:
        tss_matrix="deeptools/tss_matrix.gz",
        tss_heatmap="deeptools/tss_heatmap.pdf"
    params:
        labels=REPLICATES + INPUTS
    log: "logs/plot_heatmap_reference_point.log"
    singularity:
        PEAKSNAKE_HOME + "/sifs/deeptools_20231220.sif"
    threads: 10
    shell:
        """
        computeMatrix reference-point \
            -S {input.bws} \
            --samplesLabel {params.labels} \
            -R {input.tss_tes_bed} {input.tss_tes_shuffle_bed} \
            --referencePoint TSS \
            -b 5000 -a 5000 \
            --binSize 50 \
            -o {output.tss_matrix} \
            -p {threads} 1>{log} 2>&1
        plotHeatmap -m {output.tss_matrix} -o {output.tss_heatmap} --missingDataColor 0.5 1>>{log} 2>&1
        """
rule plot_heatmap_scale_regions:
    input:
        bws=expand("clean_bams/{replicate}.bw", replicate=REPLICATES + INPUTS),
        tss_tes_bed=config["tss_tes_bed"],
        tss_tes_shuffle_bed=config["tss_tes_shuffle_bed"]
    output:
        tss_tes_matrix="deeptools/tss_tes_matrix.gz",
        tss_tes_heatmap="deeptools/tss_tes_heatmap.pdf"
    params:
        labels=REPLICATES + INPUTS
    log: "logs/plot_heatmap_scale_regions.log"
    singularity:
        PEAKSNAKE_HOME + "/sifs/deeptools_20231220.sif"
    threads: 10
    shell:
        """
        computeMatrix scale-regions \
            -S {input.bws} \
            --samplesLabel {params.labels} \
            -R {input.tss_tes_bed} {input.tss_tes_shuffle_bed} \
            --regionBodyLength 4000 \
            -b 2000 -a 2000 \
            --binSize 50 \
            -o {output.tss_tes_matrix} \
            -p {threads} 1>{log} 2>&1
        plotHeatmap -m {output.tss_tes_matrix} -o {output.tss_tes_heatmap} --missingDataColor 0.5 1>>{log} 2>&1
        """
##########################################################
# Call peaks for each replicate:input pair
##########################################################
# Rule: MACS3 peak calling
rule callpeak_with_macs3:
    input:
        sorted_ip_bam="clean_bams/{replicate}_final.bam",
        sorted_input_bam=lambda wildcards: f"clean_bams/{config['samples'][wildcards.replicate]}_final.bam" if config['samples'][wildcards.replicate] else []
    output:
        Peak="raw_peaks/{replicate}_peaks." + PEAK_TYPE + "Peak",
        cutoff_analysis_txt="raw_peaks/{replicate}_cutoff_analysis.txt"
    log: "logs/{replicate}_callpeak_with_macs3.log"
    singularity:
        PEAKSNAKE_HOME + "/sifs/macs3_idr_20231218.sif"
    threads: 1
    params:
        control=lambda wildcards: f"-c clean_bams/{config['samples'][wildcards.replicate]}_final.bam" if config['samples'][wildcards.replicate] else "",
        format="BAMPE" if SEQ_TYPE == 'PE' and '--nomodel' not in config["macs3"]["extra"] else "BAM",
        gsize=config["gsize"],
        PEAK_TYPE="--broad" if PEAK_TYPE == "broad" else "",
        extra=config["macs3"]["extra"] if config["macs3"]["extra"] else ""
    shell:
        """
        macs3 callpeak -g {params.gsize} -t {input.sorted_ip_bam} {params.control} --name raw_peaks/{wildcards.replicate} --format {params.format} --keep-dup all --qvalue 0.075 --cutoff-analysis {params.PEAK_TYPE} {params.extra} 1>{log} 2>&1
        """
rule plot_macs_cutoff_analysis:
    input:
        cutoff_analysis_txt="raw_peaks/{replicate}_cutoff_analysis.txt"
    output:
        cutoff_analysis_pdf="raw_peaks/{replicate}_cutoff_analysis.pdf"
    log: "logs/{replicate}_plot_macs_cutoff_analysis.log"
    shell:
        """
        python {PEAKSNAKE_HOME}/scripts/plot_macs_cutoff_analysis.py {input.cutoff_analysis_txt} {output} 1>{log} 2>&1
        """
rule filter_peaks_by_qscore:
    input:
        Peak="raw_peaks/{replicate}_peaks." + PEAK_TYPE + "Peak"
    output:
        Peak="clean_peaks/cutoff/{replicate}_peaks." + PEAK_TYPE + "Peak"
    params:
        qscore=config["macs3"]["qscore"]
    shell:
        """
        awk '$9>{params.qscore}' {input.Peak} > {output.Peak}
        """
rule select_peaks_norep:
    input:
        Peak="clean_peaks/cutoff/{sample}_rep1_peaks." + PEAK_TYPE + "Peak"
    output:
        Peak="clean_peaks/norep/{sample}_peaks." + PEAK_TYPE + "Peak"
    shell:
        """
        cp {input.Peak} {output.Peak}
        """
rule select_peaks_by_intersect:
    input:
        Peak=lambda wildcards: expand("clean_peaks/cutoff/" + wildcards.sample + "_{rep}_peaks." + PEAK_TYPE + "Peak", rep=SAMPLE_TO_REPLICATE[wildcards.sample])
    output:
        Peak="clean_peaks/intersect/{sample}_peaks." + PEAK_TYPE + "Peak"
    params:
        min_overlap=config['intersect']['min_overlap']
    shell:
        """
        # Count the input files
        num_peaks=$(echo {input.Peak} | wc -w)
        # If there is only one input file
        if [ "$num_peaks" -eq 1 ]; then
            cp {input.Peak[0]} {output.Peak}
        else
            # Copy the first input file to a temp file
            cp {input.Peak[0]} {wildcards.sample}.temp.bed
            # Intersect with every input file except the first
            echo {input.Peak} | tr ' ' '\\n' | awk 'NR>1' | while read bed; do
                bedtools intersect -f {params.min_overlap} -r -a {wildcards.sample}.temp.bed -b $bed -wa > {wildcards.sample}.temp2.bed
                mv {wildcards.sample}.temp2.bed {wildcards.sample}.temp.bed
            done
            # Build an intermediate peak list
            cut -f 4 {wildcards.sample}.temp.bed > {wildcards.sample}.peak_lst
            # Use the peak list plus the first input file to produce the final output
            awk 'NR==FNR {{a[$1]; next}} $4 in a' {wildcards.sample}.peak_lst {input.Peak[0]} > {output.Peak}
            # Clean up temp files
            rm {wildcards.sample}.temp.bed {wildcards.sample}.peak_lst
        fi
        """
rule select_peaks_by_idr:
    input:
        rep1_peaks="raw_peaks/{sample}_rep1_peaks." + PEAK_TYPE + "Peak",
        rep2_peaks="raw_peaks/{sample}_rep2_peaks." + PEAK_TYPE + "Peak"
    output:
        true_rep_idr="clean_peaks/idr/{sample}_true_rep_idr.txt",
        idr_peaks="clean_peaks/idr/{sample}_peaks." + PEAK_TYPE + "Peak"
    log: "logs/{sample}_select_peaks_by_idr.log"
    singularity:
        PEAKSNAKE_HOME + "/sifs/macs3_idr_20231218.sif"
    threads: 1
    params:
        PEAK_TYPE=PEAK_TYPE,
        idr_threshold=config["idr"]["idr_threshold"]
    shell:
        """
        idr --samples {input.rep1_peaks} {input.rep2_peaks} --idr-threshold {params.idr_threshold} --output-file {output.true_rep_idr} --plot --input-file-type {params.PEAK_TYPE}Peak --rank p.value 1>{log} 2>&1
        awk -v OFS="\\t" 'BEGIN {{FS=OFS}} {{ $4=$1":"$2"_"$3; print $1, $2, $3, $4, $5, $6, $7, $8, $9, $10 }}' {output.true_rep_idr} | sort -k1,1 -k2,2n -k3,3n > {output.idr_peaks}
        """
##########################################################
# Extract peak sequences
##########################################################
rule extract_peak_sequence:
    input:
        Peak="clean_peaks/" + PEAK_SELECTION + "/{sample}_peaks." + PEAK_TYPE + "Peak"
    output:
        peak_fa="clean_peaks/" + PEAK_SELECTION + "/{sample}_peaks.fa"
    params:
        top_n=config["motif"]["top_n"],
        summit_flank=config["motif"]["summit_flank"],
        genome=config["genome"],
        summit_fa="clean_peaks/" + PEAK_SELECTION + "/{sample}_summit.fa"
    shell:
        """
        set +o pipefail;
        # Sort peaks by score and extract FASTA
        sort -k8,8nr {input.Peak} | head -n {params.top_n} | bedtools getfasta -fi {params.genome} -bed - > {output.peak_fa}
        # Handle narrow peaks: also extract summit-centered sequences
        if [[ "{PEAK_TYPE}" == "narrow" ]]; then
            sort -k8,8nr {input.Peak} | head -n {params.top_n} | awk -v OFS='\\t' -v flank={params.summit_flank} '{{print $1, $2+$10-flank, $2+$10+flank+1}}' | awk '$2>=0' | bedtools getfasta -fi {params.genome} -bed - > {params.summit_fa}
        fi
        """
##########################################################
# Peak annotation
##########################################################
rule peak_annotation_with_uropa:
    input:
        Peak="clean_peaks/" + PEAK_SELECTION + "/{sample}_peaks." + PEAK_TYPE + "Peak",
        gtf=config["gtf"]
    output:
        allhits="peak_annotation/{sample}_peaks_allhits.txt",
        finalhits="peak_annotation/{sample}_peaks_finalhits.txt"
    log: "logs/{sample}_peak_annotation_with_uropa.log"
    params:
        feature=config["uropa"]["feature"],
        feature_anchor=config["uropa"]["feature_anchor"],
        distance=config["uropa"]["distance"],
        relative_location=config["uropa"]["relative_location"]
    singularity:
        PEAKSNAKE_HOME + "/sifs/uropa_4.0.3--pyhdfd78af_0.sif"
    shell:
        """
        uropa --bed {input.Peak} --gtf {input.gtf} --feature {params.feature} --feature-anchor {params.feature_anchor} --distance {params.distance} --relative-location {params.relative_location} --show-attributes gene_id --outdir peak_annotation 1>{log} 2>&1
        """
##########################################################
# Generate consensus peaks across all samples
##########################################################
rule merge_peaks:
    input:
        sample_peaks=expand("clean_peaks/" + PEAK_SELECTION + "/{sample}_peaks." + PEAK_TYPE + "Peak", sample=SAMPLE_TO_REPLICATE.keys())
    output:
        merged_peaks_bed="clean_peaks/merge/merged_peaks.bed",
        merged_peaks_saf="clean_peaks/merge/merged_peaks.saf"
    singularity:
        PEAKSNAKE_HOME + "/sifs/commonTools_20231218.sif"
    params:
        fai=config['genome'] + ".fai"
    shell:
        """
        mkdir -p clean_peaks/merge
        cat {input.sample_peaks} > clean_peaks/merge/cat_peaks.bed
        bedtools sort -i clean_peaks/merge/cat_peaks.bed -g {params.fai} > clean_peaks/merge/cat_sorted_peaks.bed
        bedtools merge -i clean_peaks/merge/cat_sorted_peaks.bed > {output.merged_peaks_bed}
        awk 'OFS="\\t" {{print $1":"$2+1"-"$3, $1, $2+1, $3, "."}}' {output.merged_peaks_bed} > {output.merged_peaks_saf}
        """
##########################################################
# Count reads in peaks per replicate with featureCounts
##########################################################
rule run_feature_counts:
    input:
        bam="clean_bams/{replicate}_final.bam",
        merged_peaks_saf="clean_peaks/merge/merged_peaks.saf"
    output:
        counts="counts/{replicate}.count",
        stat="counts/{replicate}.log"
    log: "logs/{replicate}_run_feature_counts.log"
    params:
        peaksnake_home=PEAKSNAKE_HOME,
        isPairedEnd="TRUE" if SEQ_TYPE == 'PE' else "FALSE"
    shell:
        """
        Rscript {params.peaksnake_home}/software/RunFeatureCounts/run-featurecounts.R -b {input.bam} --saf {input.merged_peaks_saf} --isPairedEnd {params.isPairedEnd} -o counts/{wildcards.replicate} 1>{log} 2>&1
        """
##########################################################
# Merge per-replicate counts into a matrix
##########################################################
rule create_count_matrix:
    input:
        expand("counts/{replicate}.count", replicate=REPLICATES)
    output:
        counts_matrix="counts/merged_peaks.counts.matrix",
        cpm_matrix="counts/merged_peaks.TMM.CPM.matrix"
    log: "logs/create_count_matrix.log"
    params:
        peaksnake_home=PEAKSNAKE_HOME
    shell:
        """
        ls {input} >counts/count.fofn
        perl {params.peaksnake_home}/software/RunFeatureCounts/abundance_estimates_to_matrix.pl --est_method featureCounts --quant_files counts/count.fofn --out_prefix counts/merged_peaks 1>{log} 2>&1
        """
##########################################################
# Differential analysis with DESeq2 / edgeR
##########################################################
if config["contrasts"]:
    # Generate samples.txt
    with open('samples.txt', 'w') as file:
        for ip in REPLICATES:
            # Extract the sample name
            sample_name = '_'.join(ip.split('_')[:2])
            file.write(f"{sample_name}\t{ip}\n")
    # Generate contrasts.txt
    with open("contrasts.txt", "w") as file:
        file.write("\n".join([" ".join(item.split("_vs_")) for item in config["contrasts"]]))
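# For example, a hypothetical config entry
#   contrasts: ["H3K4me3_mut_vs_H3K4me3_WT"]
# is written to contrasts.txt as the line "H3K4me3_mut H3K4me3_WT", which
# run_DE_analysis.pl reads as sampleA versus sampleB.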
rule run_deseq2:
    input:
        "counts/merged_peaks.counts.matrix"
    output:
        expand("diff_peaks/{contrast}.DESeq2.DE_results", contrast=config["contrasts"])
    log: "logs/run_deseq2.log"
    singularity:
        PEAKSNAKE_HOME + "/sifs/trinity_20231218.sif"
    shell:
        """
        run_DE_analysis.pl -m {input} --method DESeq2 -s samples.txt --contrasts contrasts.txt -o diff_peaks 1>{log} 2>&1
        cd diff_peaks/
        rm *count_matrix *.MA_n_Volcano.pdf *.Rscript
        for file in merged_peaks.counts.matrix.*.DE_results; do
            echo -e "PeakID\\tsampleA\\tsampleB\\tFoldChange\\tPValue\\tPadj" >"${{file#merged_peaks.counts.matrix.}}"
            grep -v "^sampleA" $file | awk 'BEGIN {{OFS="\\t"}} {{ $7 = exp(log(2) * $7); print $1,$2,$3,$7,$10,$11}}' >> "${{file#merged_peaks.counts.matrix.}}"
            rm $file
        done
        cd ..
        """
rule run_edgeR:
    input:
        "counts/merged_peaks.counts.matrix"
    output:
        expand("diff_peaks/{contrast}.edgeR.DE_results", contrast=config["contrasts"])
    log: "logs/run_edgeR.log"
    singularity:
        PEAKSNAKE_HOME + "/sifs/trinity_20231218.sif"
    shell:
        """
        run_DE_analysis.pl -m {input} --method edgeR -s samples.txt --contrasts contrasts.txt -o diff_peaks 1>{log} 2>&1
        cd diff_peaks/
        rm *count_matrix *.MA_n_Volcano.pdf *.Rscript
        for file in merged_peaks.counts.matrix.*.DE_results; do
            echo -e "PeakID\\tsampleA\\tsampleB\\tFoldChange\\tPValue\\tPadj" >"${{file#merged_peaks.counts.matrix.}}"
            grep -v "^sampleA" $file | awk 'BEGIN {{OFS="\\t"}} {{ $4 = exp(log(2) * $4); print $1,$2,$3,$4,$6,$7}}' >> "${{file#merged_peaks.counts.matrix.}}"
            rm $file
        done
        cd ..
        """
rule diff_peaks_annotation_with_uropa:
    input:
        diff_peaks="diff_peaks/{contrast}.edgeR.DE_results",
        gtf=config["gtf"]
    output:
        diff_peaks_bed="diff_peaks_annotation/{contrast}.bed6",
        allhits="diff_peaks_annotation/{contrast}_allhits.txt",
        finalhits="diff_peaks_annotation/{contrast}_finalhits.txt"
    log: "logs/{contrast}_diff_peaks_annotation_with_uropa.log"
    params:
        diff_peaks_padj=config["diff_peaks"]["padj"],
        feature_anchor=config["uropa"]["feature_anchor"],
        distance=config["uropa"]["distance"],
        relative_location=config["uropa"]["relative_location"]
    singularity:
        PEAKSNAKE_HOME + "/sifs/uropa_4.0.3--pyhdfd78af_0.sif"
    shell:
        """
        awk 'BEGIN {{FS="\\t"; OFS="\\t"}} NR>1 && $6 < {params.diff_peaks_padj} {{split($1, arr, "[:-]"); print arr[1], arr[2]-1, arr[3], $1, $4, "."}}' {input.diff_peaks} > {output.diff_peaks_bed}
        uropa --bed {output.diff_peaks_bed} --gtf {input.gtf} --feature gene --feature-anchor {params.feature_anchor} --distance {params.distance} --relative-location {params.relative_location} --outdir diff_peaks_annotation 1>{log} 2>&1
        """
##########################################################
# Combine fastp reports into a summary table
##########################################################
rule combine_fastp_reports:
    input:
        expand("clean_data/{replicate}_fastp.json", replicate=REPLICATES + INPUTS)
    output:
        "quality_control/fastp_summary.tsv"
    log: "logs/combine_fastp_reports.log"
    params:
        peaksnake_home=PEAKSNAKE_HOME
    shell:
        """
        Rscript {params.peaksnake_home}/scripts/combine_fastp_report.R --input {input} --output quality_control/ 1>{log} 2>&1
        """
##########################################################
# Footprinting with TOBIAS
##########################################################
rule merge_replicate_bams:
    input:
        bams=lambda wildcards: expand("clean_bams/{sample}_{rep}_final.bam", sample=wildcards.sample, rep=SAMPLE_TO_REPLICATE[wildcards.sample])
    output:
        bam="clean_bams/{sample}_merged.bam"
    shell:
        """
        samtools merge -o {output.bam} {input.bams}
        samtools index {output.bam}
        """
rule ATACorrect:
    input:
        bam="clean_bams/{sample}_merged.bam",
        genome=config["genome"],
        merged_peaks_bed="clean_peaks/merge/merged_peaks.bed"
    output:
        bw="footprinting/{sample}_corrected.bw"
    params:
        prefix="{sample}"
    threads: 8
    log: "logs/{sample}_ATACorrect.log"
    singularity:
        PEAKSNAKE_HOME + "/sifs/tobias_20231231.sif"
    shell:
        """
        TOBIAS ATACorrect --window 10 --bam {input.bam} --genome {input.genome} --peaks {input.merged_peaks_bed} --outdir footprinting --prefix {params.prefix} --cores {threads} 1>{log} 2>&1
        """
rule FootprintScores:
    input:
        bw="footprinting/{sample}_corrected.bw",
        merged_peaks_bed="clean_peaks/merge/merged_peaks.bed"
    output:
        bw="footprinting/{sample}_footprints.bw"
    params:
        prefix="{sample}"
    threads: 8
    log: "logs/{sample}_FootprintScores.log"
    singularity:
        PEAKSNAKE_HOME + "/sifs/tobias_20231231.sif"
    shell:
        """
        TOBIAS FootprintScores --window 10 --signal {input.bw} --regions {input.merged_peaks_bed} --output {output.bw} --cores {threads} 1>{log} 2>&1
        """
rule BINDetect:
    input:
        bws=expand("footprinting/{sample}_footprints.bw", sample=SAMPLE_TO_REPLICATE.keys()),
        merged_peaks_bed="clean_peaks/merge/merged_peaks.bed",
        genome=config["genome"]
    output:
        "footprinting/bindetect_results.txt"
    params:
        samples=" ".join(SAMPLE_TO_REPLICATE.keys())
    threads: 8
    log: "logs/BINDetect.log"
    singularity:
        PEAKSNAKE_HOME + "/sifs/tobias_20231231.sif"
    shell:
        """
        TOBIAS BINDetect --motifs motif_databases/JASPAR/JASPAR2022_CORE_plants_non-redundant_v2.meme --signals {input.bws} --genome {input.genome} --peaks {input.merged_peaks_bed} --outdir footprinting --cond_names {params.samples} --cores {threads} 1>{log} 2>&1
        """
rule plot_per_motif:
    input:
        motif_bed="footprinting/{motif}/beds/{motif}_all.bed",
        bws=expand("footprinting/{sample}_corrected.bw", sample=SAMPLE_TO_REPLICATE.keys())
    output:
        aggregate_pdf="footprinting/{motif}/{motif}_PlotAggregate.pdf",
        heatmap_pdf="footprinting/{motif}/{motif}_PlotHeatmap.pdf"
    threads: 1
    singularity:
        PEAKSNAKE_HOME + "/sifs/tobias_20231231.sif"
    shell:
        """
        TOBIAS PlotAggregate --TFBS {input.motif_bed} --signals {input.bws} --output {output.aggregate_pdf} --share_y both --plot_boundaries --signal-on-x
        TOBIAS PlotHeatmap --TFBS {input.motif_bed} --signals {input.bws} --output {output.heatmap_pdf} --sort_by -2
        """
##########################################################
# Cross-correlation
##########################################################
rule cross_correlation:
    input:
        bam="clean_bams/{replicate}_final.bam"
    output:
        pdf="quality_control/cross_correlation/{replicate}.pdf",
        tsv="quality_control/cross_correlation/{replicate}.tsv"
    log: "logs/{replicate}_cross_correlation.log"
    singularity:
        PEAKSNAKE_HOME + "/sifs/phantompeakqualtools_20231218.sif"
    params:
        spp=config['cross_correlation']['spp'] if config['cross_correlation']['spp'] else ""
    threads: 4
    shell:
        """
        run_spp.R -p={threads} -c={input.bam} -savp={output.pdf} -out={output.tsv} -rf {params.spp} 1>{log} 2>&1
        """
rule cross_correlation_summary:
    input:
        expand("quality_control/cross_correlation/{replicate}.tsv", replicate=REPLICATES + INPUTS)
    output:
        "quality_control/cross_correlation_summary.tsv"
    shell:
        """
        echo -e "Sample\\tTotalReadsFiltered\\tFragmentLength\\tCrossCorrelation\\tPhantomPeakLocation\\tPhantomPeakCorrelation\\tMinimumCrossCorrelationShift\\tMinimumCrossCorrelationValue\\tNSC\\tRSC\\tPhantomPeakQualityTag" >{output}
        cat {input} | sort -k 1 >>{output}
        """