Snakefile 32 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797
  1. import os
  2. import yaml
  3. import pandas as pd
  4. configfile: "config.yaml"
  5. ##########################################################
  6. # 全局变量和样本信息
  7. ##########################################################
  8. ## 软件主目录
  9. PEAKSNAKE_HOME = config["PEAKSNAKE_HOME"] if config["PEAKSNAKE_HOME"] else os.getcwd()
  10. ## 关键参数
  11. PEAK_TYPE = config["peak_type"]
  12. SEQ_TYPE = config["seq_type"]
  13. PEAK_SELECTION = config["peak_selection"]
  14. GENOME = config["genome"]
  15. GTF = config["gtf"]
  16. ## 样本信息变量
  17. # 字典:REPLICATE to INPUT
  18. REPLICATE_TO_INPUT = {k: config['samples'][k] for k in sorted(config['samples'])}
  19. # 列表:所有 REPLICATES
  20. REPLICATES = sorted(list(set(REPLICATE_TO_INPUT.keys())))
  21. # 列表:所有 INPUTS
  22. INPUTS = sorted(list(set(REPLICATE_TO_INPUT.values())))
  23. INPUTS = [] if INPUTS == [None] else INPUTS
  24. # 字典: SAMPLE 与 rep1 rep2 rep2 对应关系
  25. SAMPLE_TO_REPLICATE = {}
  26. for s in REPLICATES:
  27. name, rep = '_'.join(s.split('_')[:-1]), s.split('_')[-1]
  28. SAMPLE_TO_REPLICATE.setdefault(name, []).append(rep)
  29. SAMPLE_TO_REPLICATE = {k: sorted(v) for k, v in SAMPLE_TO_REPLICATE.items()}
  30. ## 生成样本信息表
  31. with open("sample_sheet.csv", 'w') as f:
  32. f.write("SampleID,ControlID,Tissue,Factor,Condition,Treatment,Replicate,bamReads,Peaks,bamControl,PeakCaller\n")
  33. for sample, control in REPLICATE_TO_INPUT.items():
  34. sample_parts = sample.split('_')
  35. factor = sample_parts[0] # 提取样本ID中的Factor
  36. tissue = "NA"
  37. treatment = sample_parts[1] # 在这个例子中,tissue和condition是相同的
  38. condition = factor + "_" + treatment
  39. replicate = sample_parts[2].replace("rep", "") # 将"rep1"和"rep2"转换为"1"和"2"
  40. if control:
  41. control_parts = control.split('_')
  42. control_id = "_".join(control_parts[:2]) # 构建 ControlID
  43. bamControl = f"clean_bams/{control}_final.bam"
  44. else:
  45. control_id = "NA"
  46. bamControl = "NA"
  47. bamReads = f"clean_bams/{sample}_final.bam"
  48. peaks = f"clean_peaks/cutoff/{sample}_peaks.{PEAK_TYPE}Peak"
  49. f.write(f"{sample},{control_id},{tissue},{factor},{condition},{treatment},{replicate},{bamReads},{peaks},{bamControl},bed\n")
  50. ##########################################################
  51. # rule all: 最终想要生成的文件
  52. ##########################################################
  53. def get_motifs_from_meme(meme_file):
  54. motifs = []
  55. with open(meme_file, 'r') as file:
  56. for line in file:
  57. if line.startswith("MOTIF"):
  58. parts = line.split()
  59. # 检查是否有三列,且第三列不是空字符串
  60. if len(parts) == 3 and parts[2]:
  61. motif_name = parts[2] + "_" + parts[1] # 第三列和第二列的组合
  62. # 检查是否有两列或第三列为空
  63. elif len(parts) >= 2:
  64. motif_name = "_" + parts[1] # 只使用第二列
  65. motifs.append(motif_name)
  66. return motifs
  67. rule all:
  68. input:
  69. #####################################
  70. # 从 fastq 到 peaks
  71. #####################################
  72. # 测序数据质控表格
  73. "quality_control/fastp_summary.tsv",
  74. # 最终比对结果
  75. expand("clean_bams/{replicate}_final.bam", replicate = REPLICATES + INPUTS),
  76. # 比对质控表格
  77. "quality_control/estimate_read_filtering.tsv",
  78. # cross correlation 质控表格
  79. "quality_control/cross_correlation_summary.tsv" if config["cross_correlation"]["do"] else [],
  80. # bw 文件, deeptools 可视化
  81. expand("clean_bams/{replicate}.bw", replicate = REPLICATES + INPUTS),
  82. "deeptools/sample_correlation.pdf" if len(REPLICATES) > 2 else [],
  83. "deeptools/tss_heatmap.pdf",
  84. "deeptools/tss_tes_heatmap.pdf",
  85. # raw peak 结果
  86. expand("raw_peaks/{replicate}_peaks.{PEAK_TYPE}Peak", replicate = REPLICATES, PEAK_TYPE = PEAK_TYPE),
  87. # cutoff analysis
  88. expand("raw_peaks/{replicate}_cutoff_analysis.pdf", replicate = REPLICATES),
  89. # clean peak 结果
  90. expand("clean_peaks/cutoff/{replicate}_peaks.{PEAK_TYPE}Peak", replicate = REPLICATES, PEAK_TYPE = PEAK_TYPE),
  91. # 通过 intersect or idr 进行 peak 筛选
  92. expand("clean_peaks/{m}/{sample}_peaks.{PEAK_TYPE}Peak", sample=SAMPLE_TO_REPLICATE.keys(), m = PEAK_SELECTION, PEAK_TYPE = PEAK_TYPE),
  93. # 提取序列
  94. expand("clean_peaks/{m}/{sample}_peaks.fa", sample=SAMPLE_TO_REPLICATE.keys(), m = PEAK_SELECTION),
  95. # 所有样本共识peaks
  96. "clean_peaks/merge/merged_peaks.bed",
  97. #####################################
  98. # 定量分析
  99. #####################################
  100. "counts/merged_peaks.counts.matrix",
  101. "counts/merged_peaks.TMM.CPM.matrix",
  102. #####################################
  103. # Peak 注释
  104. #####################################
  105. # 对单个样本 peaks 注释
  106. expand("peak_annotation/{sample}_peaks_{hit}.txt", sample = SAMPLE_TO_REPLICATE.keys(), hit = ["allhits", "finalhits"]) if config["uropa"]["do"] else [],
  107. # 对 merge 之后的 peaks 注释
  108. expand("peak_annotation/merged_peaks_{hit}.txt", hit = ["allhits", "finalhits"]) if config["uropa"]["do"] else [],
  109. #####################################
  110. # 差异分析
  111. #####################################
  112. expand("diff_peaks/{contrast}.{m}.DE_results", contrast=config["contrasts"], m=config["diff_peaks"]["method"]) if config["diff_peaks"]["do"] and config["contrasts"] else [],
  113. #####################################
  114. # 足迹分析
  115. #####################################
  116. "footprinting/bindetect_results.txt" if config["footprint"]["do"] else []
  117. ##########################################################
  118. # 使用fastp进行原始数据质控和过滤,自动判断 SE?PE?
  119. # raw_data => clean_data
  120. ##########################################################
  121. rule fastp_quality_control:
  122. input:
  123. fastq=["raw_data/{replicate}_R1.fastq.gz", "raw_data/{replicate}_R2.fastq.gz"] if SEQ_TYPE == 'PE' else ["raw_data/{replicate}_R1.fastq.gz"]
  124. output:
  125. fastq=["clean_data/{replicate}_R1.fastq.gz", "clean_data/{replicate}_R2.fastq.gz"] if SEQ_TYPE == 'PE' else ["clean_data/{replicate}_R1.fastq.gz"],
  126. html_report="clean_data/{replicate}_fastp.html",
  127. json_report="clean_data/{replicate}_fastp.json"
  128. log: "logs/{replicate}_fastp_quality_control.log"
  129. threads: 3
  130. singularity:
  131. PEAKSNAKE_HOME + "/sifs/commonTools_20231218.sif"
  132. params:
  133. fastq="-i raw_data/{replicate}_R1.fastq.gz -o clean_data/{replicate}_R1.fastq.gz -I raw_data/{replicate}_R2.fastq.gz -O clean_data/{replicate}_R2.fastq.gz" if SEQ_TYPE == 'PE' else "-i raw_data/{replicate}_R1.fastq.gz -o clean_data/{replicate}_R1.fastq.gz",
  134. fastp=config["fastp"]
  135. shell:
  136. """
  137. fastp {params.fastq} --html {output.html_report} --json {output.json_report} --thread {threads} {params.fastp} 1>{log} 2>&1
  138. """
  139. ##########################################################
  140. # 根据基因组构建 bowtie2 index
  141. ##########################################################
  142. rule bowtie2_build:
  143. input:
  144. genome = GENOME
  145. output:
  146. index_prefix = "ref/genome.1.bt2"
  147. log: "logs/bowtie2_build.log"
  148. singularity:
  149. PEAKSNAKE_HOME + "/sifs/bowtie2_2.5.2.sif"
  150. threads: 6
  151. shell:
  152. "bowtie2-build --threads {threads} {input.genome} ref/genome 1>{log} 2>&1"
  153. ##########################################################
  154. # 使用 bowtie2 将测序数据比对到参考基因组
  155. # fastq => sam
  156. ##########################################################
  157. rule bowtie2_align:
  158. input:
  159. fastq=["clean_data/{replicate}_R1.fastq.gz", "clean_data/{replicate}_R2.fastq.gz"] if SEQ_TYPE == 'PE' else ["clean_data/{replicate}_R1.fastq.gz"],
  160. genome_index=config["bowtie2_index"] + ".1.bt2" if config["bowtie2_index"] else "ref/genome.1.bt2"
  161. output:
  162. sam="raw_bams/{replicate}.sam"
  163. log: "logs/{replicate}_bowtie2_align.log"
  164. singularity:
  165. PEAKSNAKE_HOME + "/sifs/bowtie2_2.5.2.sif"
  166. threads: 4
  167. params:
  168. genome_index=config["bowtie2_index"] if config["bowtie2_index"] else "ref/genome",
  169. fastq="-1 clean_data/{replicate}_R1.fastq.gz -2 clean_data/{replicate}_R2.fastq.gz" if SEQ_TYPE == 'PE' else "-U clean_data/{replicate}_R1.fastq.gz",
  170. bowtie2=config["bowtie2"] if config["bowtie2"] else ""
  171. shell:
  172. """
  173. bowtie2 -p {threads} -x {params.genome_index} {params.fastq} -S {output.sam} {params.bowtie2} 1>{log} 2>&1
  174. """
  175. rule sort_bam:
  176. input:
  177. sam="raw_bams/{replicate}.sam"
  178. output:
  179. sorted_bam="raw_bams/{replicate}_sorted.bam"
  180. log: "logs/{replicate}_sort_bam.log"
  181. singularity:
  182. PEAKSNAKE_HOME + "/sifs/commonTools_20231218.sif"
  183. threads: 4
  184. shell:
  185. """
  186. samtools sort -@ {threads} -o {output.sorted_bam} {input.sam} 1>{log} 2>&1
  187. samtools index {output.sorted_bam}
  188. """
  189. rule sieve_alignment:
  190. input:
  191. bam="raw_bams/{replicate}_sorted.bam"
  192. output:
  193. bam="clean_bams/{replicate}_final.bam"
  194. log: "logs/{replicate}_sieve_alignment.log"
  195. threads: 4
  196. singularity:
  197. PEAKSNAKE_HOME + "/sifs/deeptools_20231220.sif"
  198. params:
  199. peaksnake_home=PEAKSNAKE_HOME,
  200. minMappingQuality=config["alignmentSieve"]["minMappingQuality"],
  201. blacklist="--blackListFileName " + config["alignmentSieve"]["blacklist"] if config["alignmentSieve"]["blacklist"] else [],
  202. extra=config["alignmentSieve"]["extra"] if config["alignmentSieve"]["extra"] else ""
  203. shell:
  204. """
  205. alignmentSieve --numberOfProcessors {threads} --bam {input.bam} --outFile {output.bam} --filterMetrics {log} --ignoreDuplicates --minMappingQuality {params.minMappingQuality} --samFlagExclude 260 {params.blacklist} {params.extra} 1>{log} 2>&1
  206. samtools index {output.bam}
  207. """
  208. rule estimate_read_filtering:
  209. input:
  210. bams=expand("raw_bams/{replicate}_sorted.bam", replicate=REPLICATES + INPUTS)
  211. output:
  212. stat="quality_control/estimate_read_filtering.tsv"
  213. threads: 4
  214. singularity:
  215. PEAKSNAKE_HOME + "/sifs/deeptools_20231220.sif"
  216. log: "logs/estimate_read_filtering.log"
  217. params:
  218. peaksnake_home=PEAKSNAKE_HOME,
  219. minMappingQuality=config["alignmentSieve"]["minMappingQuality"],
  220. blacklist="--blackListFileName " + config["alignmentSieve"]["blacklist"] if config["alignmentSieve"]["blacklist"] else "",
  221. extra=config["alignmentSieve"]["extra"] if config["alignmentSieve"]["extra"] else "",
  222. sampleLabels=REPLICATES + INPUTS
  223. shell:
  224. """
  225. estimateReadFiltering \
  226. --numberOfProcessors {threads} \
  227. --bam {input.bams} \
  228. --sampleLabels {params.sampleLabels} \
  229. --outFile {output.stat} \
  230. --ignoreDuplicates \
  231. --minMappingQuality {params.minMappingQuality} \
  232. --samFlagExclude 260 \
  233. {params.blacklist} \
  234. {params.extra} 1>{log} 2>&1
  235. """
  236. ##########################################################
  237. # DeepTools 绘图
  238. ##########################################################
  239. rule convert_bam_to_bigwig:
  240. input:
  241. bam="clean_bams/{replicate}_final.bam"
  242. output:
  243. bw="clean_bams/{replicate}.bw"
  244. log: "logs/{replicate}_convert_bam_to_bigwig.log"
  245. singularity:
  246. PEAKSNAKE_HOME + "/sifs/deeptools_20231220.sif"
  247. threads: 2
  248. params:
  249. gsize=config["gsize"],
  250. bamCoverage = config["bamCoverage"] if config["bamCoverage"] else ""
  251. shell:
  252. """
  253. bamCoverage -p {threads} --bam {input.bam} -o {output.bw} --effectiveGenomeSize {params.gsize} {params.bamCoverage} 1>{log} 2>&1
  254. """
  255. rule summarize_multiple_bigwigs:
  256. input:
  257. bws=expand("clean_bams/{replicate}.bw", replicate=REPLICATES + INPUTS),
  258. tss_tes_bed=config["tss_tes_bed"]
  259. output:
  260. "clean_bams/bw_summary.gz"
  261. params:
  262. labels=REPLICATES + INPUTS
  263. log: "logs/summarize_multiple_bigwigs.log"
  264. singularity:
  265. PEAKSNAKE_HOME + "/sifs/deeptools_20231220.sif"
  266. threads: 10
  267. shell:
  268. """
  269. multiBigwigSummary BED-file \
  270. --bwfiles {input.bws} \
  271. --labels {params.labels} \
  272. --BED {input.tss_tes_bed} \
  273. -o {output} -p {threads} 1>{log} 2>&1
  274. """
  275. rule generate_correlation_plots:
  276. input:
  277. "clean_bams/bw_summary.gz"
  278. output:
  279. pdf="deeptools/sample_correlation.pdf",
  280. tab="deeptools/sample_correlation.tab"
  281. log: "logs/generate_correlation_plots.log"
  282. singularity:
  283. PEAKSNAKE_HOME + "/sifs/deeptools_20231220.sif"
  284. shell:
  285. """
  286. plotCorrelation -in {input} -o {output.pdf} \
  287. --corMethod pearson --whatToPlot heatmap \
  288. -min 0.5 \
  289. --plotTitle "Pearson Correlation of Samples" \
  290. --outFileCorMatrix {output.tab} 1>{log} 2>&1
  291. """
  292. rule plot_heatmap_reference_point:
  293. input:
  294. bws=expand("clean_bams/{replicate}.bw", replicate=REPLICATES + INPUTS),
  295. tss_tes_bed=config["tss_tes_bed"],
  296. tss_tes_shuffle_bed=config["tss_tes_shuffle_bed"]
  297. output:
  298. tss_matrix="deeptools/tss_matrix.gz",
  299. tss_heatmap="deeptools/tss_heatmap.pdf"
  300. params:
  301. labels=REPLICATES + INPUTS
  302. log: "logs/plot_heatmap_reference_point.log"
  303. singularity:
  304. PEAKSNAKE_HOME + "/sifs/deeptools_20231220.sif"
  305. threads: 10
  306. shell:
  307. """
  308. computeMatrix reference-point \
  309. -S {input.bws} \
  310. --samplesLabel {params.labels} \
  311. -R {input.tss_tes_bed} {input.tss_tes_shuffle_bed} \
  312. --referencePoint TSS \
  313. -b 5000 -a 5000 \
  314. --binSize 50 \
  315. -o {output.tss_matrix} \
  316. -p {threads} 1>{log} 2>&1
  317. plotHeatmap -m {output.tss_matrix} -o {output.tss_heatmap} --missingDataColor 0.5 1>>{log} 2>&1
  318. """
  319. rule plot_heatmap_scale_regions:
  320. input:
  321. bws=expand("clean_bams/{replicate}.bw", replicate=REPLICATES + INPUTS),
  322. tss_tes_bed=config["tss_tes_bed"],
  323. tss_tes_shuffle_bed=config["tss_tes_shuffle_bed"]
  324. output:
  325. tss_tes_matrix="deeptools/tss_tes_matrix.gz",
  326. tss_tes_heatmap="deeptools/tss_tes_heatmap.pdf"
  327. params:
  328. labels=REPLICATES + INPUTS
  329. log: "logs/plot_heatmap_scale_regions.log"
  330. singularity:
  331. PEAKSNAKE_HOME + "/sifs/deeptools_20231220.sif"
  332. threads: 10
  333. shell:
  334. """
  335. computeMatrix scale-regions \
  336. -S {input.bws} \
  337. --samplesLabel {params.labels} \
  338. -R {input.tss_tes_bed} {input.tss_tes_shuffle_bed} \
  339. --regionBodyLength 4000 \
  340. -b 2000 -a 2000 \
  341. --binSize 50 \
  342. -o {output.tss_tes_matrix} \
  343. -p {threads} 1>{log} 2>&1
  344. plotHeatmap -m {output.tss_tes_matrix} -o {output.tss_tes_heatmap} --missingDataColor 0.5 1>>{log} 2>&1
  345. """
  346. ##########################################################
  347. # 对每个 replicate:input call peak
  348. ##########################################################
  349. # 规则:MACS3 call peak
  350. rule callpeak_with_macs3:
  351. input:
  352. sorted_ip_bam="clean_bams/{replicate}_final.bam",
  353. sorted_input_bam=lambda wildcards: f"clean_bams/{config['samples'][wildcards.replicate]}_final.bam" if config['samples'][wildcards.replicate] else []
  354. output:
  355. Peak="raw_peaks/{replicate}_peaks." + PEAK_TYPE + "Peak",
  356. cutoff_analysis_txt="raw_peaks/{replicate}_cutoff_analysis.txt"
  357. log: "logs/{replicate}_callpeak_with_macs3.log"
  358. singularity:
  359. PEAKSNAKE_HOME + "/sifs/macs3_idr_20231218.sif"
  360. threads: 1
  361. params:
  362. control=lambda wildcards: f"-c clean_bams/{config['samples'][wildcards.replicate]}_final.bam" if config['samples'][wildcards.replicate] else "",
  363. format="BAMPE" if SEQ_TYPE == 'PE' and '--nomodel' not in config["macs3"]["extra"] else "BAM",
  364. gsize=config["gsize"],
  365. PEAK_TYPE="--broad" if PEAK_TYPE == "broad" else "",
  366. extra = config["macs3"]["extra"] if config["macs3"]["extra"] else ""
  367. shell:
  368. """
  369. macs3 callpeak -g {params.gsize} -t {input.sorted_ip_bam} {params.control} --name raw_peaks/{wildcards.replicate} --format {params.format} --keep-dup all --qvalue 0.075 --cutoff-analysis {params.PEAK_TYPE} {params.extra} 1>{log} 2>&1
  370. """
  371. rule plot_macs_cutoff_analysis:
  372. input:
  373. cutoff_analysis_txt="raw_peaks/{replicate}_cutoff_analysis.txt"
  374. output:
  375. cutoff_analysis_pdf="raw_peaks/{replicate}_cutoff_analysis.pdf"
  376. log: "logs/{replicate}_plot_macs_cutoff_analysis.log"
  377. shell:
  378. """
  379. python {PEAKSNAKE_HOME}/scripts/plot_macs_cutoff_analysis.py {input.cutoff_analysis_txt} {output} 1>{log} 2>&1
  380. """
  381. rule filter_peaks_by_qscore:
  382. input:
  383. Peak="raw_peaks/{replicate}_peaks." + PEAK_TYPE + "Peak"
  384. output:
  385. Peak="clean_peaks/cutoff/{replicate}_peaks." + PEAK_TYPE + "Peak"
  386. params:
  387. qscore=config["macs3"]["qscore"]
  388. shell:
  389. """
  390. awk '$9>{params.qscore}' {input.Peak} > {output.Peak}
  391. """
  392. rule select_peaks_norep:
  393. input:
  394. Peak="clean_peaks/cutoff/{sample}_rep1_peaks." + PEAK_TYPE + "Peak"
  395. output:
  396. Peak="clean_peaks/norep/{sample}_peaks." + PEAK_TYPE + "Peak"
  397. shell:
  398. """
  399. cp {input.Peak} {output.Peak}
  400. """
  401. rule select_peaks_by_intersect:
  402. input:
  403. Peak=lambda wildcards: expand("clean_peaks/cutoff/" + wildcards.sample + "_{rep}_peaks." + PEAK_TYPE + "Peak", rep=SAMPLE_TO_REPLICATE[wildcards.sample])
  404. output:
  405. Peak="clean_peaks/intersect/{sample}_peaks." + PEAK_TYPE + "Peak"
  406. params:
  407. min_overlap=config['intersect']['min_overlap']
  408. shell:
  409. """
  410. # 检查输入文件的数量
  411. num_peaks=$(echo {input.Peak} | wc -w)
  412. # 如果只有一个输入文件
  413. if [ "$num_peaks" -eq 1 ]; then
  414. cp {input.Peak[0]} {output.Peak}
  415. else
  416. # 复制第一个输入文件到临时文件
  417. cp {input.Peak[0]} {wildcards.sample}.temp.bed
  418. # 使用除第一个之外的所有输入文件
  419. echo {input.Peak} | tr ' ' '\\n' | awk 'NR>1' | while read bed; do
  420. bedtools intersect -f {params.min_overlap} -r -a {wildcards.sample}.temp.bed -b $bed -wa > {wildcards.sample}.temp2.bed
  421. mv {wildcards.sample}.temp2.bed {wildcards.sample}.temp.bed
  422. done
  423. # 创建一个中间的 peak list 文件
  424. cut -f 4 {wildcards.sample}.temp.bed > {wildcards.sample}.peak_lst
  425. # 使用中间的 peak list 文件和第一个输入文件生成最终输出
  426. awk 'NR==FNR {{a[$1]; next}} $4 in a' {wildcards.sample}.peak_lst {input.Peak[0]} > {output.Peak}
  427. # 清理临时文件
  428. rm {wildcards.sample}.temp.bed {wildcards.sample}.peak_lst
  429. fi
  430. """
  431. rule select_peaks_by_idr:
  432. input:
  433. rep1_peaks="raw_peaks/{sample}_rep1_peaks." + PEAK_TYPE + "Peak",
  434. rep2_peaks="raw_peaks/{sample}_rep2_peaks." + PEAK_TYPE + "Peak"
  435. output:
  436. true_rep_idr="clean_peaks/idr/{sample}_true_rep_idr.txt",
  437. idr_peaks="clean_peaks/idr/{sample}_peaks." + PEAK_TYPE + "Peak"
  438. log: "logs/{sample}_select_peaks_by_idr.log"
  439. singularity:
  440. PEAKSNAKE_HOME + "/sifs/macs3_idr_20231218.sif"
  441. threads: 1
  442. params:
  443. PEAK_TYPE=PEAK_TYPE,
  444. idr_threshold=config["idr"]["idr_threshold"]
  445. shell:
  446. """
  447. idr --samples {input.rep1_peaks} {input.rep2_peaks} --idr-threshold {params.idr_threshold} --output-file {output.true_rep_idr} --plot --input-file-type {PEAK_TYPE}Peak --rank p.value 1>{log} 2>&1
  448. awk -v OFS="\\t" 'BEGIN {{FS=OFS}} {{ $4=$1":"$2"_"$3; print $1, $2, $3, $4, $5, $6, $7, $8, $9, $10 }}' {output.true_rep_idr} | sort -k1,1 -k2,2n -k3,3n > {output.idr_peaks}
  449. """
  450. ##########################################################
  451. # 提取序列
  452. ##########################################################
  453. rule extract_peak_sequence:
  454. input:
  455. Peak="clean_peaks/{m}/{sample}_peaks." + PEAK_TYPE + "Peak"
  456. output:
  457. peak_fa="clean_peaks/{m}/{sample}_peaks.fa"
  458. params:
  459. top_n=config["motif"]["top_n"],
  460. summit_flank=config["motif"]["summit_flank"],
  461. genome=config["genome"],
  462. summit_fa="clean_peaks/{m}/{sample}_summit.fa"
  463. shell:
  464. """
  465. set +o pipefail;
  466. # Sorting and extracting fasta for peak
  467. sort -k8,8nr {input.Peak} | head -n {params.top_n} | bedtools getfasta -fi {params.genome} -bed - > {output.peak_fa}
  468. # Handling narrow peaks
  469. if [[ "{PEAK_TYPE}" == "narrow" ]]; then
  470. sort -k8,8nr {input.Peak} | head -n {params.top_n} | awk -v OFS='\\t' -v flank={params.summit_flank} '{{print $1, $2+$10-flank, $2+$10+flank+1}}' | awk '$2>=0' | bedtools getfasta -fi {params.genome} -bed - > {params.summit_fa}
  471. fi
  472. """
  473. ##########################################################
  474. # Peak 注释
  475. ##########################################################
  476. rule peak_annotation_with_uropa:
  477. input:
  478. Peak="clean_peaks/" + PEAK_SELECTION + "/{sample}_peaks." + PEAK_TYPE + "Peak",
  479. gtf=config["gtf"]
  480. output:
  481. allhits="peak_annotation/{sample}_peaks_allhits.txt",
  482. finalhits="peak_annotation/{sample}_peaks_finalhits.txt"
  483. log: "logs/{sample}_peak_annotation_with_uropa.log"
  484. params:
  485. feature=config["uropa"]["feature"],
  486. feature_anchor=config["uropa"]["feature_anchor"],
  487. distance=config["uropa"]["distance"],
  488. relative_location=config["uropa"]["relative_location"]
  489. singularity:
  490. PEAKSNAKE_HOME + "/sifs/uropa_4.0.3--pyhdfd78af_0.sif"
  491. shell:
  492. """
  493. uropa --bed {input.Peak} --gtf {input.gtf} --feature {params.feature} --feature-anchor {params.feature_anchor} --distance {params.distance} --relative-location {params.relative_location} --show-attributes gene_id --outdir peak_annotation 1>{log} 2>&1
  494. """
  495. ##########################################################
  496. # 生成所有 samples 共识 peaks
  497. ##########################################################
  498. rule merge_peaks:
  499. input:
  500. sample_peaks=expand("clean_peaks/" + PEAK_SELECTION + "/{sample}_peaks." + PEAK_TYPE + "Peak", sample=SAMPLE_TO_REPLICATE.keys())
  501. output:
  502. merged_peaks_bed="clean_peaks/merge/merged_peaks.bed",
  503. merged_peaks_saf="clean_peaks/merge/merged_peaks.saf"
  504. singularity:
  505. PEAKSNAKE_HOME + "/sifs/commonTools_20231218.sif"
  506. params:
  507. fai=config['genome'] + ".fai"
  508. shell:
  509. """
  510. mkdir -p clean_peaks/merge
  511. cat {input.sample_peaks} > clean_peaks/merge/cat_peaks.bed
  512. bedtools sort -i clean_peaks/merge/cat_peaks.bed -g {params.fai} > clean_peaks/merge/cat_sorted_peaks.bed
  513. bedtools merge -i clean_peaks/merge/cat_sorted_peaks.bed > {output.merged_peaks_bed}
  514. awk 'OFS="\t" {{print $1":"$2"-"$3, $1, $2+1, $3, "."}}' {output.merged_peaks_bed} > {output.merged_peaks_saf}
  515. """
  516. rule merged_peak_annotation_with_uropa:
  517. input:
  518. Peak="clean_peaks/merge/merged_peaks.bed",
  519. gtf=config["gtf"]
  520. output:
  521. allhits="peak_annotation/merged_peaks_allhits.txt",
  522. finalhits="peak_annotation/merged_peaks_finalhits.txt"
  523. log: "logs/merged_peak_annotation_with_uropa.log"
  524. params:
  525. feature=config["uropa"]["feature"],
  526. feature_anchor=config["uropa"]["feature_anchor"],
  527. distance=config["uropa"]["distance"],
  528. relative_location=config["uropa"]["relative_location"]
  529. singularity:
  530. PEAKSNAKE_HOME + "/sifs/uropa_4.0.3--pyhdfd78af_0.sif"
  531. shell:
  532. """
  533. uropa --bed {input.Peak} --gtf {input.gtf} --feature {params.feature} --feature-anchor {params.feature_anchor} --distance {params.distance} --relative-location {params.relative_location} --show-attributes gene_id --outdir peak_annotation 1>{log} 2>&1
  534. """
  535. ##########################################################
  536. # 使用 feature counts 计算 replicate peak counts
  537. ##########################################################
  538. rule run_feature_counts:
  539. input:
  540. bam="clean_bams/{replicate}_final.bam",
  541. merged_peaks_saf="clean_peaks/merge/merged_peaks.saf"
  542. output:
  543. counts="counts/{replicate}.count",
  544. stat="counts/{replicate}.log"
  545. log: "logs/{replicate}_run_feature_counts.log"
  546. params:
  547. peaksnake_home=PEAKSNAKE_HOME,
  548. isPairedEnd="TRUE" if SEQ_TYPE == 'PE' else "FALSE"
  549. shell:
  550. """
  551. Rscript {params.peaksnake_home}/software/RunFeatureCounts/run-featurecounts.R -b {input.bam} --saf {input.merged_peaks_saf} --isPairedEnd {params.isPairedEnd} -o counts/{wildcards.replicate} 1>{log} 2>&1
  552. """
  553. ##########################################################
  554. # 合并生成 replicate counts 矩阵
  555. ##########################################################
  556. rule create_count_matrix:
  557. input:
  558. expand("counts/{replicate}.count", replicate=REPLICATES)
  559. output:
  560. counts_matrix="counts/merged_peaks.counts.matrix",
  561. cpm_matrix="counts/merged_peaks.TMM.CPM.matrix"
  562. log: "logs/create_count_matrix.log"
  563. params:
  564. peaksnake_home=PEAKSNAKE_HOME
  565. shell:
  566. """
  567. ls {input} >counts/count.fofn
  568. perl {params.peaksnake_home}/software/RunFeatureCounts/abundance_estimates_to_matrix.pl --est_method featureCounts --quant_files counts/count.fofn --out_prefix counts/merged_peaks 1>{log} 2>&1
  569. """
  570. ##########################################################
  571. # 使用 DESeq2/ edgeR 进行差异分析
  572. ##########################################################
  573. if config["contrasts"]:
  574. # 生成 samples.txt
  575. with open('samples.txt', 'w') as file:
  576. for ip in REPLICATES:
  577. # 提取 sample name
  578. sample_name = '_'.join(ip.split('_')[:2])
  579. file.write(f"{sample_name}\t{ip}\n")
  580. # 生成 contrasts.txt
  581. with open("contrasts.txt", "w") as file:
  582. file.write("\n".join([" ".join(item.split("_vs_")) for item in config["contrasts"]]))
  583. rule run_deseq2:
  584. input:
  585. "counts/merged_peaks.counts.matrix"
  586. output:
  587. expand("diff_peaks/{contrast}.DESeq2.DE_results", contrast=config["contrasts"])
  588. log: "logs/run_deseq2.log"
  589. params:
  590. extra=config["diff_peaks"]["extra"]
  591. singularity:
  592. PEAKSNAKE_HOME + "/sifs/trinity_20231218.sif"
  593. shell:
  594. """
  595. run_DE_analysis.pl -m {input} --method DESeq2 -s samples.txt --contrasts contrasts.txt -o diff_peaks {params.extra} 1>{log} 2>&1
  596. cd diff_peaks/
  597. rm *count_matrix *.MA_n_Volcano.pdf *.Rscript
  598. for file in merged_peaks.counts.matrix.*.DE_results; do
  599. echo -e "PeakID\tsampleA\tsampleB\tlog2FoldChange\tPValue\tPadj" >"${{file#merged_peaks.counts.matrix.}}"
  600. grep -v "^sampleA" $file | awk 'BEGIN {{OFS="\t"}} {{print $1,$2,$3,$7,$10,$11}}' >> "${{file#merged_peaks.counts.matrix.}}"
  601. rm $file
  602. done
  603. cd ..
  604. """
  605. rule run_edgeR:
  606. input:
  607. "counts/merged_peaks.counts.matrix"
  608. output:
  609. expand("diff_peaks/{contrast}.edgeR.DE_results", contrast=config["contrasts"])
  610. log: "logs/run_edgeR.log"
  611. params:
  612. extra=config["diff_peaks"]["extra"]
  613. singularity:
  614. PEAKSNAKE_HOME + "/sifs/trinity_20231218.sif"
  615. shell:
  616. """
  617. run_DE_analysis.pl -m {input} --method edgeR -s samples.txt --contrasts contrasts.txt -o diff_peaks {params.extra} 1>{log} 2>&1
  618. cd diff_peaks/
  619. rm *count_matrix *.MA_n_Volcano.pdf *.Rscript
  620. for file in merged_peaks.counts.matrix.*.DE_results; do
  621. echo -e "PeakID\tsampleA\tsampleB\tlog2FoldChange\tPValue\tPadj" >"${{file#merged_peaks.counts.matrix.}}"
  622. grep -v "^sampleA" $file | awk 'BEGIN {{OFS="\t"}} {{print $1,$2,$3,$4,$6,$7}}' >> "${{file#merged_peaks.counts.matrix.}}"
  623. rm $file
  624. done
  625. cd ..
  626. """
  627. ##########################################################
  628. # 合并fastp质控结果生成表格
  629. ##########################################################
  630. rule combine_fastp_reports:
  631. input:
  632. expand("clean_data/{replicate}_fastp.json", replicate=REPLICATES + INPUTS)
  633. output:
  634. "quality_control/fastp_summary.tsv"
  635. log: "logs/combine_fastp_reports.log"
  636. params:
  637. peaksnake_home=PEAKSNAKE_HOME
  638. shell:
  639. """
  640. Rscript {params.peaksnake_home}/scripts/combine_fastp_report.R --input {input} --output quality_control/ 1>{log} 2>&1
  641. """
  642. ##########################################################
  643. # 使用 TOBIAS 进行足迹分析
  644. ##########################################################
  645. rule merge_replicate_bams:
  646. input:
  647. bams=lambda wildcards: expand("clean_bams/{sample}_{rep}_final.bam", sample = wildcards.sample, rep=SAMPLE_TO_REPLICATE[wildcards.sample])
  648. output:
  649. bam="clean_bams/{sample}_merged.bam"
  650. shell:
  651. """
  652. samtools merge -o {output.bam} {input.bams}
  653. samtools index {output.bam}
  654. """
  655. rule ATACorrect:
  656. input:
  657. bam="clean_bams/{sample}_merged.bam",
  658. genome=config["genome"],
  659. merged_peaks_bed="clean_peaks/merge/merged_peaks.bed"
  660. output:
  661. bw="footprinting/{sample}_corrected.bw"
  662. params:
  663. prefix="{sample}"
  664. threads: 8
  665. log: "logs/{sample}_ATACorrect.log"
  666. singularity:
  667. PEAKSNAKE_HOME + "/sifs/tobias_20231231.sif"
  668. shell:
  669. """
  670. TOBIAS ATACorrect --bam {input.bam} --genome {input.genome} --peaks {input.merged_peaks_bed} --outdir footprinting --prefix {params.prefix} --cores {threads} 1>{log} 2>&1
  671. """
  672. rule FootprintScores:
  673. input:
  674. bw="footprinting/{sample}_corrected.bw",
  675. merged_peaks_bed="clean_peaks/merge/merged_peaks.bed"
  676. output:
  677. bw="footprinting/{sample}_footprints.bw"
  678. params:
  679. prefix={sample}
  680. threads: 8
  681. log: "logs/{sample}_FootprintScores.log"
  682. singularity:
  683. PEAKSNAKE_HOME + "/sifs/tobias_20231231.sif"
  684. shell:
  685. """
  686. TOBIAS ScoreBigwig --signal {input.bw} --regions {input.merged_peaks_bed} --output {output.bw} --cores {threads} 1>{log} 2>&1
  687. """
  688. rule BINDetect:
  689. input:
  690. bws=expand("footprinting/{sample}_footprints.bw", sample=SAMPLE_TO_REPLICATE.keys()),
  691. merged_peaks_bed="clean_peaks/merge/merged_peaks.bed",
  692. genome=config["genome"]
  693. output:
  694. "footprinting/bindetect_results.txt",
  695. expand("footprinting/{motif}/beds/{motif}_all.bed", motif=get_motifs_from_meme(config["footprint"]["motif"])),
  696. expand("footprinting/{motif}/beds/{motif}_{sample}_{b}.bed", motif=get_motifs_from_meme(config["footprint"]["motif"]), sample=SAMPLE_TO_REPLICATE.keys(), b=["bound", "unbound"])
  697. params:
  698. samples=" ".join(SAMPLE_TO_REPLICATE.keys()),
  699. motif_databases=config["footprint"]["motif"]
  700. threads: 8
  701. log: "logs/BINDetect.log"
  702. singularity:
  703. PEAKSNAKE_HOME + "/sifs/tobias_20231231.sif"
  704. shell:
  705. """
  706. TOBIAS BINDetect --motifs {params.motif_databases} --signals {input.bws} --genome {input.genome} --peaks {input.merged_peaks_bed} --outdir footprinting --cond_names {params.samples} --cores {threads} 1>{log} 2>&1
  707. """
  708. ##########################################################
  709. # cross correlation
  710. ##########################################################
  711. rule cross_correlation:
  712. input:
  713. bam="clean_bams/{replicate}_final.bam"
  714. output:
  715. pdf="quality_control/cross_correlation/{replicate}.pdf",
  716. tsv="quality_control/cross_correlation/{replicate}.tsv"
  717. log: "logs/{replicate}_cross_correlation.log"
  718. singularity:
  719. PEAKSNAKE_HOME + "/sifs/phantompeakqualtools_20231218.sif"
  720. params:
  721. spp=config['cross_correlation']['spp'] if config['cross_correlation']['spp'] else ""
  722. threads: 4
  723. shell:
  724. """
  725. run_spp.R -p={threads} -c={input.bam} -savp={output.pdf} -out={output.tsv} -rf {params.spp} 1>{log} 2>&1
  726. """
  727. rule cross_correlation_summary:
  728. input:
  729. expand("quality_control/cross_correlation/{replicate}.tsv", replicate=REPLICATES + INPUTS)
  730. output:
  731. "quality_control/cross_correlation_summary.tsv"
  732. shell:
  733. """
  734. echo -e "Sample\\tTotalReadsFiltered\\tFragmentLength\\tCrossCorrelation\\tPhantomPeakLocation\\tPhantomPeakCorrelation\\tMinimumCrossCorrelationShift\\tMinimumCrossCorrelationValue\\tNSC\\tRSC\\tPhantomPeakQualityTag" >{output}
  735. cat {input} | sort -k 1 >>{output}
  736. """