Snakefile 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487
  1. import os
  2. import yaml
  3. import pandas as pd
  4. configfile: "config.yaml"
  5. ##########################################################
  6. # 全局变量和样本信息
  7. ##########################################################
  8. HICSNAKE_HOME = config["HICSNAKE_HOME"] if config["HICSNAKE_HOME"] else os.getcwd()
  9. GENOME = config["genome"]
  10. RESOLUTIONS = config["resolutions"]
  11. BASE_RESOLUTION = min(RESOLUTIONS)
  12. with open('chromosomes.txt', 'r') as file:
  13. CHROMOSOMES = [line.strip() for line in file]
  14. meta_data = pd.read_csv('samples.txt', sep='\t')
  15. REPLICATES = sorted(meta_data['replicate'].tolist())
  16. SAMPLES2REPLICATES = meta_data.groupby('sample')['replicate'].apply(list).to_dict()
  17. SAMPLES = SAMPLES2REPLICATES.keys()
  18. contrasts_data = pd.read_csv('contrasts.txt', sep='\t', header=None)
  19. CONTRASTS = [f"{row[0]}_vs_{row[1]}" for row in contrasts_data.itertuples(index=False)]
  20. ##########################################################
  21. # rule all: 最终想要生成的文件
  22. ##########################################################
  23. rule all:
  24. input:
  25. #第一步: 测序数据的过滤和质控
  26. #"clean_data/fastp_summary.tsv",
  27. #第二步:比对到参考基因组
  28. #expand("aligned/{replicate}_R{i}.bam", replicate=REPLICATES, i=[1,2]),
  29. #第三步: 构建接触矩阵
  30. #expand("Matrix/Raw/{replicate}.cool", replicate=REPLICATES),
  31. #expand("Matrix/Merged/{sample}.cool", sample=SAMPLES),
  32. #expand("Matrix/Normalized/{sample}.cool", sample=SAMPLES),
  33. #第四步: 格式转换
  34. expand("Matrix/Final/{sample}.{format}", sample=SAMPLES, format=["mcool"]),
  35. #第五步: 接触矩阵质控
  36. "Matrix/Raw/hicQC.html",
  37. "Matrix/Raw/SCC.csv",
  38. expand("Matrix/Final/{sample}_res.csv", sample=SAMPLES),
  39. #第六步: AB compartment, TADs, Loops
  40. expand("Matrix/Diff/{resolution}/{contrast}.cool", resolution=config["TAD_resolution"], contrast=CONTRASTS),
  41. expand("Compartments/CALDER2/{resolution}/{sample}/sub_compartments/all_sub_compartments.bed", resolution=config["Compartment_resolution"], sample=SAMPLES),
  42. expand("TADs/HiCExplorer/{resolution}/{sample}_domains.bed", resolution=config["TAD_resolution"], sample=SAMPLES),
  43. expand("TADs/HiCExplorer/{resolution}/{contrast}_accepted.diff_tad", resolution=config["TAD_resolution"],contrast=CONTRASTS),
  44. expand("Loops/Mustache/{resolution}/{sample}.tsv", resolution=config["Loop_resolution"], sample=SAMPLES),
  45. expand("Loops/Mustache/{resolution}/{contrast}.diffloop1", resolution=config["Loop_resolution"], contrast=CONTRASTS),
  46. # 可视化
  47. expand("plots/{resolution}/{sample}_plotmatrix_allchr.png", resolution=config["Plot_matrix_resolution"], sample=SAMPLES),
  48. expand("plots/{resolution}/{sample}_plotmatrix_perchr.png", resolution=config["Plot_matrix_resolution"], sample=SAMPLES)
  49. #********************************************************#
  50. # 第一步: 测序数据的过滤和质控 #
  51. #********************************************************#
  52. # 使用fastp进行原始数据质控和过滤,自动判断 SE?PE?
  53. rule fastp_quality_control:
  54. input:
  55. read1="raw_data/{replicate}_R1.fastq.gz",
  56. read2="raw_data/{replicate}_R2.fastq.gz"
  57. output:
  58. read1="clean_data/{replicate}_R1.fastq.gz",
  59. read2="clean_data/{replicate}_R2.fastq.gz",
  60. html_report="clean_data/{replicate}_fastp.html",
  61. json_report="clean_data/{replicate}_fastp.json"
  62. log: "logs/{replicate}_fastp_quality_control.log"
  63. threads: 3
  64. singularity:
  65. HICSNAKE_HOME + "/sifs/commonTools_20240327.sif"
  66. params:
  67. fastp=config["fastp"]
  68. shell:
  69. """
  70. fastp -i {input.read1} -o {output.read1} -I {input.read2} -O {output.read2} --html {output.html_report} --json {output.json_report} --thread {threads} {params.fastp} 1>{log} 2>&1
  71. """
  72. # 合并fastp质控结果生成表格
  73. rule combine_fastp_reports:
  74. input:
  75. expand("clean_data/{replicate}_fastp.json", replicate=REPLICATES)
  76. output:
  77. "clean_data/fastp_summary.tsv"
  78. log: "logs/combine_fastp_reports.log"
  79. params:
  80. hicsnake_home=HICSNAKE_HOME
  81. shell:
  82. """
  83. Rscript {params.hicsnake_home}/scripts/combine_fastp_report.R --input {input} --output clean_data/ 1>{log} 2>&1
  84. """
  85. #********************************************************#
  86. # 第二步:比对到参考基因组 #
  87. #********************************************************#
  88. # 根据基因组构建 bwa index
  89. rule bwa_index:
  90. input:
  91. genome = "ref/" + GENOME + ".fa"
  92. output:
  93. genome_index = "ref/genome.bwt"
  94. log: "logs/bwa_index.log"
  95. singularity:
  96. HICSNAKE_HOME + "/sifs/commonTools_20240327.sif"
  97. threads: 6
  98. shell:
  99. "bwa index -p ref/genome {input.genome} 1>{log} 2>&1"
  100. # 使用 bwa 比对
  101. rule bwa:
  102. input:
  103. genome_index="ref/genome.bwt",
  104. read="clean_data/{replicate}_R{i}.fastq.gz",
  105. output:
  106. sam="aligned/{replicate}_R{i}.sam"
  107. threads: 20
  108. log: "logs/{replicate}_R{i}_bwa.log"
  109. singularity:
  110. HICSNAKE_HOME + "/sifs/commonTools_20240327.sif"
  111. shell:
  112. """
  113. bwa mem -A1 -B4 -E50 -L0 -t {threads} -o {output.sam} ref/genome {input.read} 2>{log}
  114. """
  115. rule sam2bam:
  116. input:
  117. sam="aligned/{replicate}_R{i}.sam"
  118. output:
  119. bam="aligned/{replicate}_R{i}.bam"
  120. threads: 6
  121. log: "logs/{replicate}_R{i}_sam2bam.log"
  122. singularity:
  123. HICSNAKE_HOME + "/sifs/commonTools_20240327.sif"
  124. shell:
  125. """
  126. samtools view -hb --threads {threads} -o {output.bam} {input.sam}
  127. """
  128. #********************************************************#
  129. # 第三步: 构建接触矩阵 #
  130. #********************************************************#
  131. # 生成酶切位点
  132. rule hicFindRestSite:
  133. input:
  134. genome="ref/" + GENOME + ".fa"
  135. output:
  136. rest_site="ref/rest_site.bed"
  137. log: "logs/hicFindRestSite.log"
  138. params:
  139. restriction_sequence=config["restriction_sequence"]
  140. singularity:
  141. HICSNAKE_HOME + "/sifs/hicexplorer_20240327.sif"
  142. shell:
  143. """
  144. hicFindRestSite --fasta {input.genome} --searchPattern {params.restriction_sequence} -o {output.rest_site} 1>{log} 2>&1
  145. """
  146. # 生成 cool 格式矩阵 hicBuildMatrix
  147. rule hicBuildMatrix:
  148. input:
  149. read1_bam="aligned/{replicate}_R1.bam",
  150. read2_bam="aligned/{replicate}_R2.bam",
  151. rest_site="ref/rest_site.bed"
  152. output:
  153. bam="Matrix/Raw/{replicate}.bam",
  154. cool="Matrix/Raw/{replicate}.cool",
  155. log="Matrix/Raw/{replicate}/QC.log"
  156. log: "logs/{replicate}_hicBuildMatrix.log"
  157. params:
  158. genome="ref/" + GENOME + ".fa",
  159. base_resolution=BASE_RESOLUTION,
  160. hicBuildMatrix=config["hicBuildMatrix"]
  161. singularity:
  162. HICSNAKE_HOME + "/sifs/hicexplorer_20240327.sif"
  163. threads: 6
  164. shell:
  165. """
  166. hicBuildMatrix --samFiles {input.read1_bam} {input.read2_bam} \
  167. --genomeAssembly {params.genome} \
  168. --outBam {output.bam} \
  169. --outFileName {output.cool} \
  170. --QCfolder Matrix/Raw/{wildcards.replicate} \
  171. --restrictionCutFile {input.rest_site} \
  172. --binSize {params.base_resolution} \
  173. --threads {threads} {params.hicBuildMatrix} 1>{log} 2>&1
  174. """
  175. # 合并生物学重复的矩阵
  176. rule hicSumMatrices:
  177. input:
  178. cools=lambda wildcards: expand("Matrix/Raw/{replicate}.cool", replicate=SAMPLES2REPLICATES[wildcards.sample])
  179. output:
  180. cool="Matrix/Merged/{sample}.cool"
  181. log: "logs/{sample}_hicSumMatrices.log"
  182. singularity:
  183. HICSNAKE_HOME + "/sifs/hicexplorer_20240327.sif"
  184. threads: 3
  185. shell:
  186. """
  187. if [ $(echo {input.cools} | wc -w) -gt 1 ]; then
  188. hicSumMatrices --matrices {input.cools} --outFileName {output.cool} 1>{log} 2>&1
  189. else
  190. cp {input.cools[0]} {output.cool} 1>{log} 2>&1
  191. fi
  192. """
  193. # 归一化接触矩阵
  194. rule hicNormalize:
  195. input:
  196. cools=expand("Matrix/Merged/{sample}.cool", sample=SAMPLES, allow_missing=True)
  197. output:
  198. cools=expand("Matrix/Normalized/{sample}.cool", sample=SAMPLES, allow_missing=True)
  199. params:
  200. hicNormalize=config["hicNormalize"]
  201. singularity:
  202. HICSNAKE_HOME + "/sifs/hicexplorer_20240327.sif"
  203. threads: 1
  204. shell:
  205. """
  206. if [ $(echo {input.cools} | wc -w) -gt 1 ]; then
  207. hicNormalize --matrices {input.cools} --outFileName {output.cools} {params.hicNormalize}
  208. else
  209. cp {input.cools[0]} {output.cools[0]}
  210. fi
  211. """
  212. #********************************************************#
  213. # 第四步: 格式转换 #
  214. #********************************************************#
  215. # 转换为 mcool 格式
  216. rule cooler_zoomify:
  217. input:
  218. cool="Matrix/Normalized/{sample}.cool"
  219. output:
  220. mcool="Matrix/Final/{sample}.mcool"
  221. log: "logs/{sample}_cooler_zoomify.log"
  222. params:
  223. resolutions=",".join(map(str, RESOLUTIONS))
  224. singularity:
  225. HICSNAKE_HOME + "/sifs/cooler_20240402.sif"
  226. threads: 4
  227. shell:
  228. """
  229. cooler zoomify {input.cool} -n {threads} --resolutions {params.resolutions} -o Matrix/Final/{wildcards.sample}_nobalanced.mcool
  230. cooler zoomify {input.cool} -n {threads} --balance --resolutions {params.resolutions} -o {output.mcool}
  231. """
  232. # 转换为 ginteractions 格式
  233. rule convert_to_ginteractions:
  234. input:
  235. cool="Matrix/Normalized/{sample}.cool"
  236. output:
  237. ginteractions="Matrix/Final/{sample}.ginteractions.tsv"
  238. log: "logs/{sample}_hicConvertFormat_ginteractions.log"
  239. params:
  240. resolutions=RESOLUTIONS,
  241. prefix="Matrix/Final/{sample}.ginteractions"
  242. singularity:
  243. HICSNAKE_HOME + "/sifs/hicexplorer_20240327.sif"
  244. threads: 1
  245. shell:
  246. """
  247. hicConvertFormat --matrices {input.cool} --outFileName {params.prefix} --inputFormat cool --outputFormat ginteractions 1>{log} 2>&1
  248. """
  249. # 转换为 hic 格式
  250. rule convert_to_hic:
  251. input:
  252. ginteractions="Matrix/Final/{sample}.ginteractions.tsv",
  253. chrom_sizes = "ref/" + GENOME + ".chrom.sizes"
  254. output:
  255. hic="Matrix/Final/{sample}.hic"
  256. log: "logs/{sample}_convert_to_hic.log"
  257. params:
  258. resolutions=",".join(str(r) for r in RESOLUTIONS)
  259. singularity:
  260. HICSNAKE_HOME + "/sifs/juicer_20240401"
  261. threads: 1
  262. shell:
  263. """
  264. awk -F "\\t" '{{print 0, $1, $2, 0, 0, $4, $5, 1, $7}}' {input.ginteractions} > Matrix/Final/{wildcards.sample}.short
  265. sort -k2,2d -k6,6d Matrix/Final/{wildcards.sample}.short > Matrix/Final/{wildcards.sample}.short.sorted
  266. java -Xms6g -Xmx40g -jar /opt/juicer_tools.jar pre -r {params.resolutions} Matrix/Final/{wildcards.sample}.short.sorted {output.hic} {input.chrom_sizes}
  267. """
  268. #********************************************************#
  269. # 第五步: 接触矩阵质控 #
  270. #********************************************************#
  271. # 使用 hicQC 进行REPLICATE 层次质控
  272. rule hicQC:
  273. input:
  274. logs=expand("Matrix/Raw/{replicate}/QC.log", replicate=REPLICATES)
  275. output:
  276. html="Matrix/Raw/hicQC.html"
  277. params:
  278. REPLICATES=REPLICATES
  279. singularity:
  280. HICSNAKE_HOME + "/sifs/hicexplorer_20240327.sif"
  281. threads: 20
  282. shell:
  283. """
  284. hicQC --logfiles {input.logs} --labels {params.REPLICATES} --outputFolder Matrix/Raw
  285. """
  286. # 计算 SCC
  287. rule hicrep:
  288. input:
  289. cools=expand("Matrix/Raw/{replicate}.cool", replicate=REPLICATES)
  290. output:
  291. scc="Matrix/Raw/SCC.csv"
  292. params:
  293. replicates=REPLICATES,
  294. hicrep=config["hicrep"]
  295. singularity:
  296. HICSNAKE_HOME + "/sifs/hicrep_20240423.sif"
  297. threads: 4
  298. shell:
  299. """
  300. python scripts/scc.py --workers {threads} {params.hicrep} --output {output.scc} --sample_names {params.replicates} --files {input.cools}
  301. """
  302. # 评估数据分辨率
  303. rule estimate_resolution:
  304. input:
  305. mcool="Matrix/Final/{sample}.mcool"
  306. output:
  307. res="Matrix/Final/{sample}_res.csv"
  308. params:
  309. hicres=config["hicres"]
  310. singularity:
  311. HICSNAKE_HOME + "/sifs/cooler_20240402.sif"
  312. threads: 4
  313. shell:
  314. """
  315. python scripts/hicres.py {input.mcool} --threads {threads} --output {output.res} {params.hicres}
  316. """
  317. # hicPlotDistVsCounts: 距离衰减曲线, 查看样本/重复差异
  318. rule hicPlotDistVsCounts:
  319. input:
  320. cool=expand("Matrix/Raw/{replicate}.cool", replicate=REPLICATES)
  321. output:
  322. pdf="Matrix/Raw/DistVsCounts.pdf"
  323. params:
  324. replicates=REPLICATES,
  325. hicPlotDistVsCounts=config["hicPlotDistVsCounts"]
  326. singularity:
  327. HICSNAKE_HOME + "/sifs/hicexplorer_20240327.sif"
  328. threads: 2
  329. shell:
  330. """
  331. hicPlotDistVsCounts --matrices {input.cool} --labels {params.replicates} --plotFile {output.pdf} {params.hicPlotDistVsCounts}
  332. """
  333. #********************************************************#
  334. # 第七步: 三维结构预测 #
  335. #********************************************************#
  336. rule calder2:
  337. input:
  338. mcool="Matrix/Final/{sample}.mcool",
  339. output:
  340. "Compartments/CALDER2/{resolution}/{sample}/sub_compartments/all_sub_compartments.bed"
  341. params:
  342. calder=config["calder"],
  343. resolution="{resolution}"
  344. threads: 4
  345. log: "logs/{sample}_{resolution}_calder2.log"
  346. shell:
  347. """
  348. Rscript scripts/calder -i {input.mcool} -t mcool -b {params.resolution} -p {threads} -o Compartments/CALDER2/{wildcards.resolution}/{wildcards.sample} {params.calder}
  349. """
  350. # 使用 HiCExplorer 检测 TADs
  351. rule hicFindTADs:
  352. input:
  353. mcool="Matrix/Final/{sample}.mcool"
  354. output:
  355. boundaries="TADs/HiCExplorer/{resolution}/{sample}_boundaries.bed",
  356. domains="TADs/HiCExplorer/{resolution}/{sample}_domains.bed",
  357. score="TADs/HiCExplorer/{resolution}/{sample}_score.bedgraph"
  358. params:
  359. hicFindTADs=config["hicFindTADs"],
  360. prefix="TADs/HiCExplorer/{resolution}/{sample}",
  361. resolution="{resolution}"
  362. singularity:
  363. HICSNAKE_HOME + "/sifs/hicexplorer_20240327.sif"
  364. threads: 4
  365. shell:
  366. """
  367. hicFindTADs -p {threads} --matrix {input.mcool}::/resolutions/{params.resolution} --outPrefix {params.prefix} {params.hicFindTADs}
  368. """
  369. # 使用 HiCExplorer 检测差异 TADs
  370. rule hicDifferentialTAD:
  371. input:
  372. treatment_mcool="Matrix/Final/{treatment}.mcool",
  373. control_mcool="Matrix/Final/{control}.mcool",
  374. treatment_tads="TADs/HiCExplorer/{resolution}/{treatment}_domains.bed"
  375. output:
  376. accepted_diff_tad="TADs/HiCExplorer/{resolution}/{treatment}_vs_{control}_accepted.diff_tad"
  377. params:
  378. hicDifferentialTAD=config["hicDifferentialTAD"],
  379. prefix="TADs/HiCExplorer/{resolution}/{treatment}_vs_{control}",
  380. resolution="{resolution}"
  381. singularity:
  382. HICSNAKE_HOME + "/sifs/hicexplorer_20240327.sif"
  383. threads: 4
  384. shell:
  385. """
  386. hicDifferentialTAD --targetMatrix {input.treatment_mcool}::/resolutions/{params.resolution} --controlMatrix {input.control_mcool}::/resolutions/{params.resolution} --tadDomains {input.treatment_tads} --outFileNamePrefix {params.prefix} {params.hicDifferentialTAD} --threads {threads}
  387. """
  388. # 使用 mustache 检测 Loops
  389. rule mustache:
  390. input:
  391. mcool="Matrix/Final/{sample}.mcool"
  392. output:
  393. loops="Loops/Mustache/{resolution}/{sample}.tsv"
  394. params:
  395. mustache=config["mustache"],
  396. resolution="{resolution}"
  397. singularity:
  398. HICSNAKE_HOME + "/sifs/mustache_20240410.sif"
  399. threads: 6
  400. shell:
  401. """
  402. mustache --file {input.mcool} --resolution {params.resolution} --outfile {output.loops} --processes {threads} {params.mustache}
  403. """
  404. # 使用 mustache 检测差异 Loops
  405. rule diff_mustache:
  406. input:
  407. treatment_mcool="Matrix/Final/{treatment}.mcool",
  408. control_mcool="Matrix/Final/{control}.mcool",
  409. output:
  410. diffloop1="Loops/Mustache/{resolution}/{treatment}_vs_{control}.diffloop1",
  411. diffloop2="Loops/Mustache/{resolution}/{treatment}_vs_{control}.diffloop2"
  412. params:
  413. diff_mustache=config["diff_mustache"],
  414. prefix="Loops/Mustache/{resolution}/{treatment}_vs_{control}",
  415. resolution="{resolution}"
  416. singularity:
  417. HICSNAKE_HOME + "/sifs/mustache_20240410.sif"
  418. threads: 6
  419. shell:
  420. """
  421. python /opt/conda/lib/python3.12/site-packages/mustache/diff_mustache.py -f1 {input.treatment_mcool} -f2 {input.control_mcool} --resolution {params.resolution} --outfile {params.prefix} --processes {threads} {params.diff_mustache}
  422. """
  423. #********************************************************#
  424. # 可视化 #
  425. #********************************************************#
  426. rule hicPlotMatrix:
  427. input:
  428. mcool="Matrix/Final/{sample}.mcool"
  429. output:
  430. allchr_png="plots/{resolution}/{sample}_plotmatrix_allchr.png",
  431. perchr_png="plots/{resolution}/{sample}_plotmatrix_perchr.png"
  432. params:
  433. resolution="{resolution}"
  434. shell:
  435. """
  436. hicPlotMatrix -m {input.mcool}::/resolutions/{params.resolution} -o {output.allchr_png} --log1p --colorMap YlOrRd --clearMaskedBins --log1p
  437. hicPlotMatrix -m {input.mcool}::/resolutions/{params.resolution} -o {output.perchr_png} --log1p --colorMap YlOrRd --clearMaskedBins --perChromosome --log1p
  438. """
  439. # 使用 hicCompareMatrices 比较矩阵
  440. rule hicCompareMatrices:
  441. input:
  442. treatment_mcool="Matrix/Final/{treatment}.mcool",
  443. control_mcool="Matrix/Final/{control}.mcool",
  444. output:
  445. cool="Matrix/Diff/{resolution}/{treatment}_vs_{control}.cool"
  446. params:
  447. resolution="{resolution}"
  448. singularity:
  449. HICSNAKE_HOME + "/sifs/hicexplorer_20240327.sif"
  450. threads: 6
  451. shell:
  452. """
  453. hicCompareMatrices --matrices {input.treatment_mcool}::resolutions/{params.resolution} {input.control_mcool}::resolutions/{params.resolution} --operation log2ratio -o {output.cool}
  454. """